Trabajo Practico¶

Desarrollo de Modelos Analíticos (Machine Learning) K5051 -2024¶

Grupo 3¶

Choque Llanqui, Edson Gustavo¶

Nigliazzo, Matias Ezequiel¶

Sanchez, Tomas Agustin¶

In [ ]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
pd.set_option('future.no_silent_downcasting', True)
data = pd.read_csv(r'data/data.csv', sep='|', dtype={'client_id':int},skipfooter = 1, engine='python')
In [ ]:
data.shape
Out[ ]:
(238615, 77)
In [ ]:
[x for x in data.columns if x.startswith('Saving')]
Out[ ]:
['SavingAccount_Active_ARG_Salary',
 'SavingAccount_Active_ARG',
 'SavingAccount_Active_DOLLAR',
 'SavingAccount_Balance_FirstDate',
 'SavingAccount_Balance_LastDate',
 'SavingAccount_Balance_Average',
 'SavingAccount_Days_with_use',
 'SavingAccount_Days_with_Credits',
 'SavingAccount_Days_with_Debits',
 'SavingAccount_Salary_Payment_Transactions',
 'SavingAccount_Transfer_In_Transactions',
 'SavingAccount_ATM_Extraction_Transactions',
 'SavingAccount_Service_Payment_Transactions',
 'SavingAccount_CreditCard_Payment_Transactions',
 'SavingAccount_Transfer_Out_Transactions',
 'SavingAccount_DebitCard_Spend_Transactions',
 'SavingAccount_Transactions_Transactions',
 'SavingAccount_Credits_Transactions',
 'SavingAccount_Debits_Transactions',
 'SavingAccount_Salary_Payment_Amount',
 'SavingAccount_Transfer_In_Amount',
 'SavingAccount_ATM_Extraction_Amount',
 'SavingAccount_Service_Payment_Amount',
 'SavingAccount_CreditCard_Payment_Amount',
 'SavingAccount_Transfer_Out_Amount',
 'SavingAccount_DebitCard_Spend_Amount',
 'SavingAccount_Total_Amount',
 'SavingAccount_Credits_Amounts',
 'SavingAccount_Debits_Amounts']
In [ ]:
data.Month.value_counts()
Out[ ]:
Month
2019-04-01    26547
2019-03-01    26547
2019-02-01    26512
2019-01-01    26503
2018-08-01    26502
2018-10-01    26501
2018-09-01    26501
2018-12-01    26501
2018-11-01    26501
Name: count, dtype: int64
In [ ]:
len(data.client_id.unique())
Out[ ]:
26560
In [ ]:
#clientes en la base 26560
In [ ]:
data_9m = pd.DataFrame(data.client_id.value_counts().reset_index())
data_9m.columns = ['client_id','cantidad_meses']
In [ ]:
data_9m
Out[ ]:
client_id cantidad_meses
0 5856970 9
1 5895899 9
2 4712252 9
3 7304330 9
4 6657428 9
... ... ...
26555 6623284 1
26556 4424661 1
26557 5643352 1
26558 6641590 1
26559 264018 1

26560 rows × 2 columns

In [ ]:
data_9m[data_9m.cantidad_meses == 9].shape
Out[ ]:
(26483, 2)
In [ ]:
#me quedo con los de 9 meses
data_9m = data_9m[data_9m.cantidad_meses == 9].copy()
In [ ]:
#sin paquetes en el ultimo mes
In [ ]:
data.Package_Active.value_counts()
Out[ ]:
Package_Active
No     234177
Yes      4438
Name: count, dtype: int64
In [ ]:
#tenemos 9 meses, pero usaremos 2 para la prediccion, 1 para lead window y el resto pata el entrenamiento
data_sin_paquete = data[(data.Package_Active == 'No') & (data.Month == '2019-01-01')][['client_id']]

data_sin_paquete.shape
Out[ ]:
(26026, 1)
In [ ]:
#condicion comercial, sera que el cliente tenga cobranding( que es lo de coto, cencusud, etc)
In [ ]:
len(data[(data.CreditCard_CoBranding == 'Yes') & (data.Month == '2019-01-01')].client_id.unique())
Out[ ]:
2843
In [ ]:
#prediction Window
In [ ]:
data.Target.value_counts()
Out[ ]:
Target
0.0    176359
1.0     62256
Name: count, dtype: int64
In [ ]:
data_Target = data[(data.Target == 1) & (data.Month.isin(['2019-04-01','2019-03-01']))][['client_id']].drop_duplicates()
data_Target.shape

data_Target['TGT'] = 1
In [ ]:
data_cobranding = data[(data.CreditCard_CoBranding == 'Yes') & (data.Month == '2019-01-01')][['client_id']]
data_cobranding.shape
Out[ ]:
(2843, 1)
In [ ]:
data_cruce_cobranding = data_cobranding.merge(data_Target, how='left', on='client_id').fillna(0)

data_cruce_cobranding.TGT.value_counts()
Out[ ]:
TGT
0.0    2836
1.0       7
Name: count, dtype: int64
In [ ]:
# como vemos, los que tienen cobranding y con target 1, son solo 7. Esos no los voy a usar, son muy pocos, ademas
# hay una constraint de negocio de los bancos, que dice que a los que tienen cobranding, no les doy paquetes
#porque no tienen  mucha plata
In [ ]:
## asi que lo que necesitaria es para cumplir el objetivo de "Vender paquetes" es:
# Cruzar todo y quedarse con los clientes aptos:
# 9 mese de data (no le vendo a los clientes nuevos)
# sin cobranding (porque si tiene cobranding se que no les voy a vender a ellos)
# sin paquete activo (no le voy a vender un producto que ya tiene)
In [ ]:
data_cobranding_No = data[(data.CreditCard_CoBranding == 'No') & (data.Month == '2019-01-01')][['client_id']]
In [ ]:
data_cobranding_No
Out[ ]:
client_id
2 5928737
10 6018047
11 5359038
16 6890812
20 115383
... ...
238573 6570413
238574 6258895
238585 6397274
238586 6007291
238612 6412619

23660 rows × 1 columns

In [ ]:
universo = data_9m.merge(data_sin_paquete, how='inner', on='client_id')\
                .merge(data_cobranding_No, how='inner', on='client_id')\
                .merge(data_Target, how='left', on='client_id').fillna(0)

universo.TGT.value_counts()
            
Out[ ]:
TGT
0.0    16368
1.0     6823
Name: count, dtype: int64
In [ ]:
#en nuestro universo el 30% de las personas compran paquetes

Exploratory Data Analysis¶

In [ ]:
training_window = data[(data.Month.isin(['2018-08-01','2018-09-01','2018-10-01','2018-11-01','2018-12-01','2019-01-01']))]
training_window.shape
Out[ ]:
(159009, 77)
In [ ]:
training_window
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_External CreditCard_Payment_Cash CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product
0 5856970 1.0 2018-10-01 2013-10-23 2019-01-10 No Yes No No No ... 0.0 0.0 0.0 0.0 0.0 1.0 Yes Yes NaN NaN
1 6371753 0.0 2018-09-01 2015-07-29 2018-06-02 No No No No No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes No NaN NaN
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 No No No No No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes Yes NaN NaN
3 475064 0.0 2018-12-01 2014-07-13 2017-11-30 No Yes No No No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes Yes NaN NaN
4 3615172 0.0 2018-09-01 2017-12-27 2017-12-28 No No No No No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes No NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
238597 1673642 0.0 2018-11-01 2017-08-18 2017-09-26 No Yes No No No ... 0.0 0.0 1.0 0.0 0.0 0.0 No Yes NaN NaN
238603 6145735 1.0 2018-11-01 2014-10-26 2014-10-26 No Yes No No No ... 0.0 0.0 1.0 0.0 0.0 0.0 Yes Yes NaN NaN
238604 5638786 1.0 2018-11-01 2012-12-26 2017-03-08 No Yes No No No ... 0.0 0.0 0.0 0.0 1.0 0.0 Yes No NaN NaN
238608 3824781 0.0 2018-08-01 2014-11-27 2019-01-04 No No No No No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes Yes NaN NaN
238612 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 No No No Yes No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes No NaN NaN

159009 rows × 77 columns

In [ ]:
training_window.client_id.value_counts()
Out[ ]:
client_id
5856970    6
3852147    6
1129478    6
711456     6
2531821    6
          ..
6264756    2
5124642    1
6623284    1
1419642    1
6858355    1
Name: count, Length: 26509, dtype: int64
In [ ]:
#para quitar esos clientes que no tienen 6 registros,
#cruzare mi training_window(estos serian 6 meses) con el universo de trabajo(estos son nueve meses)
In [ ]:
training_window = training_window.merge(universo, how='inner', on='client_id')
In [ ]:
training_window.client_id.value_counts()
Out[ ]:
client_id
5856970    6
2428341    6
6454439    6
5678167    6
7308381    6
          ..
6210931    6
2758381    6
6348905    6
5982253    6
5967858    6
Name: count, Length: 23191, dtype: int64

Identity features del ultimo mes de training window¶

In [ ]:
#crear data frame con IF
In [ ]:
dfIdentityFeatures = training_window[training_window.Month == '2019-01-01'][['client_id','Target','Investment_Numbers','CreditCard_Total_Limit']]
dfIdentityFeatures
Out[ ]:
client_id Target Investment_Numbers CreditCard_Total_Limit
2 5928737 0.0 0.0 0.0
8 6018047 1.0 0.0 80000.0
9 5359038 1.0 0.0 64000.0
11 6890812 0.0 0.0 0.0
13 115383 0.0 0.0 0.0
... ... ... ... ...
139131 6570413 0.0 0.0 28000.0
139132 6258895 0.0 0.0 0.0
139138 6397274 0.0 0.0 40000.0
139139 6007291 0.0 0.0 96000.0
139145 6412619 0.0 0.0 0.0

23191 rows × 4 columns

Transform Features del ultimo mes de training window¶

In [ ]:
dfTransformFeatures = training_window[training_window.Month == '2019-01-01'][['client_id']]

columnas = ['CreditCard_Premium','CreditCard_Active','CreditCard_CoBranding','Loan_Active',
            'Mortgage_Active', 'SavingAccount_Active_ARG_Salary','SavingAccount_Active_ARG','SavingAccount_Active_DOLLAR'
           ,'DebitCard_Active','Investment_Active','Package_Active','Insurance_Life'
           ,'Insurance_Home','Insurance_Accidents','Insurance_Mobile','Insurance_ATM','Insurance_Unemployment']

for columna in columnas:
    dfTransformFeatures[columna] = np.where(training_window[training_window.Month == '2019-01-01'][columna] == 'Yes',1,0)
#----------    

dfTransformFeatures['Sex'] = np.where(training_window[training_window.Month == '2019-01-01']['Sex'] == 'F', 0, 1)

#--------
TiposSegurosColumnas = [x for x in training_window.columns if x.startswith('Insurance')]

dfInsurance = training_window[training_window.Month == '2019-01-01'][['client_id']]
for columna in TiposSegurosColumnas:
    dfInsurance[columna] = np.where(training_window[training_window.Month == '2019-01-01'][columna] == 'Yes',1,0)

dfTransformFeatures['Total_Seguros_del_UltimosMes'] = dfInsurance[TiposSegurosColumnas[0]] + dfInsurance[TiposSegurosColumnas[1]] + dfInsurance[TiposSegurosColumnas[2]] +dfInsurance[TiposSegurosColumnas[3]] +dfInsurance[TiposSegurosColumnas[4]]+dfInsurance[TiposSegurosColumnas[5]]


#---------------
dfTransformFeatures['Client_Age_grp'] = training_window[training_window.Month == '2019-01-01'][['Client_Age_grp']]

di = { 
 "Entre 40 y 49 años"          : 40,
 "Entre 30 y 39 años"                 : 30,
    "Entre 50 y 59 años"                 : 50,
    "Entre 60 y 64 años"                 : 60,
    "Entre 65 y 69 años"                 : 65,
    "Entre 18 y 29 años"                 : 18,
    "Mayor a 70 años"                 : 70,
    "Menor a 18 años"                 : 17,
 }
dfTransformFeatures.Client_Age_grp = dfTransformFeatures.Client_Age_grp.map(di)

#-----------------

#dfTransformFeatures['MaxSavingAccount_ATM_Extraction_Last6'] = tr

# total cantidad de seguros del ultimo mes,  grupos de edades ponerlos en numeros, maxima extraccion de atm de los ultmos 6 meses
dfTransformFeatures
Out[ ]:
client_id CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active SavingAccount_Active_ARG_Salary SavingAccount_Active_ARG SavingAccount_Active_DOLLAR DebitCard_Active ... Package_Active Insurance_Life Insurance_Home Insurance_Accidents Insurance_Mobile Insurance_ATM Insurance_Unemployment Sex Total_Seguros_del_UltimosMes Client_Age_grp
2 5928737 0 0 0 0 0 0 1 1 1 ... 0 0 0 0 0 0 0 1 0 30
8 6018047 1 1 0 0 0 0 1 0 1 ... 0 0 0 0 0 0 0 1 0 60
9 5359038 1 1 0 1 0 1 1 1 1 ... 0 0 0 0 0 0 0 1 0 40
11 6890812 0 0 0 1 0 0 1 1 1 ... 0 0 0 0 0 0 0 1 0 40
13 115383 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 1 0 70
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139131 6570413 0 1 0 0 0 0 1 1 1 ... 0 0 0 0 0 0 0 0 0 30
139132 6258895 0 0 0 0 0 0 1 1 1 ... 0 0 0 0 0 0 0 0 0 30
139138 6397274 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 40
139139 6007291 1 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 40
139145 6412619 0 0 0 1 0 0 1 0 1 ... 0 0 0 0 0 0 0 0 0 60

23191 rows × 21 columns

Missing values¶

In [ ]:
training_window.columns[training_window.isnull().any()].tolist()
Out[ ]:
['SavingAccount_Balance_Average', 'Region', 'CreditCard_Product']

Tratamiento missing: SavingAccount_Balance_Average¶

In [ ]:
training_window[training_window.SavingAccount_Balance_Average.fillna(-999) == -999]
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT
50026 4473325 0.0 2018-12-01 2008-05-27 2017-05-18 Yes Yes No No No ... 0.0 0.0 0.0 0.0 Yes Yes NaN NaN 9 0.0
50027 4837071 1.0 2018-12-01 2013-02-21 2018-09-04 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes NaN NaN 9 1.0
50029 4525957 1.0 2019-01-01 2012-10-16 2016-08-31 Yes Yes No No No ... 1.0 0.0 0.0 0.0 Yes Yes NaN NaN 9 1.0
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 No No No Yes No ... 0.0 0.0 0.0 0.0 Yes No NaN NaN 9 0.0

4 rows × 79 columns

In [ ]:
training_window[training_window.client_id == '6412619']['SavingAccount_Balance_Average']
Out[ ]:
Series([], Name: SavingAccount_Balance_Average, dtype: float64)
In [ ]:
training_window['SavingAccount_Balance_Average'] = np.where(training_window.SavingAccount_Balance_Average.isnull(),
    training_window.SavingAccount_Balance_FirstDate - training_window.SavingAccount_Debits_Amounts + training_window.SavingAccount_Credits_Amounts,
    training_window.SavingAccount_Balance_Average)
In [ ]:
training_window[training_window.client_id == '6412619']['SavingAccount_Balance_Average']
Out[ ]:
Series([], Name: SavingAccount_Balance_Average, dtype: float64)

Tratamiento missing: Region¶

In [ ]:
training_window[training_window.Region.fillna('Empty') == 'Empty']
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT
0 5856970 1.0 2018-10-01 2013-10-23 2019-01-10 No Yes No No No ... 0.0 0.0 0.0 1.0 Yes Yes NaN NaN 9 1.0
1 6371753 0.0 2018-09-01 2015-07-29 2018-06-02 No No No No No ... 0.0 0.0 0.0 0.0 Yes No NaN NaN 9 0.0
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes NaN NaN 9 0.0
3 475064 0.0 2018-12-01 2014-07-13 2017-11-30 No Yes No No No ... 0.0 0.0 0.0 0.0 Yes Yes NaN NaN 9 0.0
4 3615172 0.0 2018-09-01 2017-12-27 2017-12-28 No No No No No ... 0.0 0.0 0.0 0.0 Yes No NaN NaN 9 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139141 1673642 0.0 2018-11-01 2017-08-18 2017-09-26 No Yes No No No ... 1.0 0.0 0.0 0.0 No Yes NaN NaN 9 0.0
139142 6145735 1.0 2018-11-01 2014-10-26 2014-10-26 No Yes No No No ... 1.0 0.0 0.0 0.0 Yes Yes NaN NaN 9 1.0
139143 5638786 1.0 2018-11-01 2012-12-26 2017-03-08 No Yes No No No ... 0.0 0.0 1.0 0.0 Yes No NaN NaN 9 1.0
139144 3824781 0.0 2018-08-01 2014-11-27 2019-01-04 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes NaN NaN 9 0.0
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 No No No Yes No ... 0.0 0.0 0.0 0.0 Yes No NaN NaN 9 0.0

139146 rows × 79 columns

In [ ]:
data[data.Month == '2019-04-01']
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_External CreditCard_Payment_Cash CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product
9 5775560 0.0 2019-04-01 2013-08-22 2014-08-01 No Yes No No No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes Yes REGION CENTRO J55660104XX012
13 5800470 0.0 2019-04-01 2013-08-23 2018-03-26 No Yes Yes No No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes Yes REGION CENTRO J55660123XX012
17 3540244 0.0 2019-04-01 2018-09-07 2018-09-07 No No No No No ... 0.0 0.0 0.0 0.0 0.0 0.0 No No REGION CENTRO NaN
43 6912865 0.0 2019-04-01 2017-08-17 2019-01-04 No No No No No ... 0.0 0.0 0.0 0.0 1.0 0.0 Yes Yes BUENOS AIRES NaN
47 6595044 0.0 2019-04-01 2016-01-14 2017-09-26 No Yes No No No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes No REGION CENTRO J55660104XX012
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
238564 5678167 1.0 2019-04-01 2013-02-26 2018-11-28 No Yes No No No ... 0.0 0.0 0.0 0.0 1.0 0.0 No No BUENOS AIRES J55660104XX012
238589 6948039 1.0 2019-04-01 2017-08-14 2019-01-10 No Yes No No No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes Yes REGION NORTE GRANDE ARGENTINO J55660104XX012
238590 1818546 1.0 2019-04-01 2013-12-10 2018-02-07 No Yes No No No ... 0.0 1.0 0.0 0.0 0.0 0.0 Yes Yes REGION CUYO J55660104XX012
238611 6377583 0.0 2019-04-01 2015-06-03 2019-01-04 No Yes No Yes No ... 0.0 0.0 0.0 0.0 0.0 0.0 Yes Yes BUENOS AIRES J55660104XX012
238613 5542402 0.0 2019-04-01 2012-09-13 2012-09-13 No Yes No No No ... 0.0 0.0 0.0 1.0 0.0 0.0 No Yes REGION NORTE GRANDE ARGENTINO J55660104XX012

26547 rows × 77 columns

In [ ]:
training_window.Region.value_counts()
Out[ ]:
Series([], Name: count, dtype: int64)
In [ ]:
training_window['Region'].describe()
Out[ ]:
count       0
unique      0
top       NaN
freq      NaN
Name: Region, dtype: object
In [ ]:
training_win_buffer = training_window.copy()

data_last_Moth = data[data.Month == '2019-04-01'][['client_id','Region']].copy()
data_last_Moth.rename(columns={'Region':'RegionUpdated'}, inplace=True)

training_win_buffer = training_win_buffer.merge(data_last_Moth, how='left', on='client_id')

training_window['Region'] = training_win_buffer['RegionUpdated']

training_window
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT
0 5856970 1.0 2018-10-01 2013-10-23 2019-01-10 No Yes No No No ... 0.0 0.0 0.0 1.0 Yes Yes AMBA Resto NaN 9 1.0
1 6371753 0.0 2018-09-01 2015-07-29 2018-06-02 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION CENTRO NaN 9 0.0
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION NORTE GRANDE ARGENTINO NaN 9 0.0
3 475064 0.0 2018-12-01 2014-07-13 2017-11-30 No Yes No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION CUYO NaN 9 0.0
4 3615172 0.0 2018-09-01 2017-12-27 2017-12-28 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION CENTRO NaN 9 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139141 1673642 0.0 2018-11-01 2017-08-18 2017-09-26 No Yes No No No ... 1.0 0.0 0.0 0.0 No Yes BUENOS AIRES NaN 9 0.0
139142 6145735 1.0 2018-11-01 2014-10-26 2014-10-26 No Yes No No No ... 1.0 0.0 0.0 0.0 Yes Yes REGION PATAGONICA NaN 9 1.0
139143 5638786 1.0 2018-11-01 2012-12-26 2017-03-08 No Yes No No No ... 0.0 0.0 1.0 0.0 Yes No REGION NORTE GRANDE ARGENTINO NaN 9 1.0
139144 3824781 0.0 2018-08-01 2014-11-27 2019-01-04 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes BUENOS AIRES NaN 9 0.0
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 No No No Yes No ... 0.0 0.0 0.0 0.0 Yes No REGION PATAGONICA NaN 9 0.0

139146 rows × 79 columns

In [ ]:
training_window['Region'].describe()
Out[ ]:
count           139110
unique               7
top       BUENOS AIRES
freq             41904
Name: Region, dtype: object
In [ ]:
training_window.Region.value_counts()
Out[ ]:
Region
BUENOS AIRES                     41904
REGION CENTRO                    27474
REGION NORTE GRANDE ARGENTINO    22044
REGION PATAGONICA                14592
CABA Centro/Norte                12288
AMBA Resto                       10968
REGION CUYO                       9840
Name: count, dtype: int64

Missing values Region que no se pudieron completar¶

In [ ]:
training_window[training_window.Region.fillna('Empty') == 'Empty']
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT
5618 2181839 0.0 2018-08-01 2006-10-05 2006-10-05 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
12257 921872 0.0 2018-09-01 2005-11-24 2005-11-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
18700 2181839 0.0 2019-01-01 2006-10-05 2006-10-05 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
20460 436555 0.0 2018-11-01 2005-02-09 2005-02-09 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
25548 727081 0.0 2018-08-01 2005-07-18 2005-07-18 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
37210 535416 0.0 2018-09-01 2005-04-13 2005-04-13 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
37430 833913 0.0 2018-08-01 2005-08-31 2014-01-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
44841 436555 0.0 2018-08-01 2005-02-09 2005-02-09 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
53787 2181839 0.0 2018-09-01 2006-10-05 2006-10-05 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
55285 921872 0.0 2018-11-01 2005-11-24 2005-11-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
57045 436555 0.0 2018-10-01 2005-02-09 2005-02-09 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
62988 921872 0.0 2018-12-01 2005-11-24 2005-11-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
67184 833913 0.0 2019-01-01 2005-08-31 2014-01-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
77610 727081 0.0 2018-09-01 2005-07-18 2005-07-18 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
82529 2181839 0.0 2018-10-01 2006-10-05 2006-10-05 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
85105 436555 0.0 2018-09-01 2005-02-09 2005-02-09 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
87881 535416 0.0 2018-10-01 2005-04-13 2005-04-13 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
89371 2181839 0.0 2018-12-01 2006-10-05 2006-10-05 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
93734 833913 0.0 2018-11-01 2005-08-31 2014-01-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
93761 833913 0.0 2018-10-01 2005-08-31 2014-01-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
94231 2181839 0.0 2018-11-01 2006-10-05 2006-10-05 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
97122 727081 0.0 2018-11-01 2005-07-18 2005-07-18 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
99710 535416 0.0 2018-12-01 2005-04-13 2005-04-13 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
101314 535416 0.0 2019-01-01 2005-04-13 2005-04-13 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
101414 727081 0.0 2019-01-01 2005-07-18 2005-07-18 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
103617 727081 0.0 2018-10-01 2005-07-18 2005-07-18 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
105080 535416 0.0 2018-11-01 2005-04-13 2005-04-13 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
108222 833913 0.0 2018-09-01 2005-08-31 2014-01-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
109178 833913 0.0 2018-12-01 2005-08-31 2014-01-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
118693 921872 0.0 2018-10-01 2005-11-24 2005-11-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
120440 727081 0.0 2018-12-01 2005-07-18 2005-07-18 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
122807 436555 0.0 2018-12-01 2005-02-09 2005-02-09 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
130135 921872 0.0 2018-08-01 2005-11-24 2005-11-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
132195 921872 0.0 2019-01-01 2005-11-24 2005-11-24 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
135035 436555 0.0 2019-01-01 2005-02-09 2005-02-09 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0
139104 535416 0.0 2018-08-01 2005-04-13 2005-04-13 No No No No No ... 0.0 0.0 0.0 0.0 No No NaN NaN 9 0.0

36 rows × 79 columns

In [ ]:
data[(data.Month == '2019-04-01') & (data.client_id == '833913')]
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_External CreditCard_Payment_Cash CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product

0 rows × 77 columns

In [ ]:
data[data.client_id == '833913']
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_External CreditCard_Payment_Cash CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product

0 rows × 77 columns

In [ ]:
training_window['Region'] = np.where(training_window['Region'].isna(), 
                                                          'BUENOS AIRES',
                                                         training_window['Region'])
In [ ]:
training_window[training_window.Region.fillna('Empty') == 'Empty']
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT

0 rows × 79 columns

In [ ]:
training_window.Region.value_counts()
Out[ ]:
Region
BUENOS AIRES                     41940
REGION CENTRO                    27474
REGION NORTE GRANDE ARGENTINO    22044
REGION PATAGONICA                14592
CABA Centro/Norte                12288
AMBA Resto                       10968
REGION CUYO                       9840
Name: count, dtype: int64

Tratamiento missing: CreditCard_Product¶

In [ ]:
training_window.CreditCard_Product.value_counts()
Out[ ]:
Series([], Name: count, dtype: int64)
In [ ]:
training_window[training_window.CreditCard_Product.fillna('Empty') == 'Empty']
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT
0 5856970 1.0 2018-10-01 2013-10-23 2019-01-10 No Yes No No No ... 0.0 0.0 0.0 1.0 Yes Yes AMBA Resto NaN 9 1.0
1 6371753 0.0 2018-09-01 2015-07-29 2018-06-02 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION CENTRO NaN 9 0.0
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION NORTE GRANDE ARGENTINO NaN 9 0.0
3 475064 0.0 2018-12-01 2014-07-13 2017-11-30 No Yes No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION CUYO NaN 9 0.0
4 3615172 0.0 2018-09-01 2017-12-27 2017-12-28 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION CENTRO NaN 9 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139141 1673642 0.0 2018-11-01 2017-08-18 2017-09-26 No Yes No No No ... 1.0 0.0 0.0 0.0 No Yes BUENOS AIRES NaN 9 0.0
139142 6145735 1.0 2018-11-01 2014-10-26 2014-10-26 No Yes No No No ... 1.0 0.0 0.0 0.0 Yes Yes REGION PATAGONICA NaN 9 1.0
139143 5638786 1.0 2018-11-01 2012-12-26 2017-03-08 No Yes No No No ... 0.0 0.0 1.0 0.0 Yes No REGION NORTE GRANDE ARGENTINO NaN 9 1.0
139144 3824781 0.0 2018-08-01 2014-11-27 2019-01-04 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes BUENOS AIRES NaN 9 0.0
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 No No No Yes No ... 0.0 0.0 0.0 0.0 Yes No REGION PATAGONICA NaN 9 0.0

139146 rows × 79 columns

In [ ]:
training_window['CreditCard_Product'].describe()
Out[ ]:
count       0
unique      0
top       NaN
freq      NaN
Name: CreditCard_Product, dtype: object
In [ ]:
CreditActive_clients = training_window[(training_window.CreditCard_Active == 'Yes') & (training_window.Month == '2019-01-01')]['client_id']

CreditCardProduct_buffer = data[(data.Month == '2019-04-01') & (data.client_id.isin(CreditActive_clients))][['client_id','CreditCard_Product']].copy()

CreditCardProduct_buffer.rename(columns={'CreditCard_Product':'CreditCard_ProductUpdated'}, inplace=True)

training_win_buffer2 = training_window.copy()

training_win_buffer2 = training_win_buffer2.merge(CreditCardProduct_buffer, how='left', on='client_id')

training_window['CreditCard_Product'] = training_win_buffer2['CreditCard_ProductUpdated']

training_window
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT
0 5856970 1.0 2018-10-01 2013-10-23 2019-01-10 No Yes No No No ... 0.0 0.0 0.0 1.0 Yes Yes AMBA Resto J55660202XX012 9 1.0
1 6371753 0.0 2018-09-01 2015-07-29 2018-06-02 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION CENTRO NaN 9 0.0
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION NORTE GRANDE ARGENTINO NaN 9 0.0
3 475064 0.0 2018-12-01 2014-07-13 2017-11-30 No Yes No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION CUYO J55660202XX012 9 0.0
4 3615172 0.0 2018-09-01 2017-12-27 2017-12-28 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION CENTRO NaN 9 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139141 1673642 0.0 2018-11-01 2017-08-18 2017-09-26 No Yes No No No ... 1.0 0.0 0.0 0.0 No Yes BUENOS AIRES J55660104XX012 9 0.0
139142 6145735 1.0 2018-11-01 2014-10-26 2014-10-26 No Yes No No No ... 1.0 0.0 0.0 0.0 Yes Yes REGION PATAGONICA J55660202XX012 9 1.0
139143 5638786 1.0 2018-11-01 2012-12-26 2017-03-08 No Yes No No No ... 0.0 0.0 1.0 0.0 Yes No REGION NORTE GRANDE ARGENTINO J55660202XX012 9 1.0
139144 3824781 0.0 2018-08-01 2014-11-27 2019-01-04 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes BUENOS AIRES NaN 9 0.0
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 No No No Yes No ... 0.0 0.0 0.0 0.0 Yes No REGION PATAGONICA NaN 9 0.0

139146 rows × 79 columns

In [ ]:
training_window['CreditCard_Product'].describe()
Out[ ]:
count              88590
unique                 7
top       J55660104XX012
freq               49572
Name: CreditCard_Product, dtype: object
In [ ]:
training_window.CreditCard_Product.value_counts()
Out[ ]:
CreditCard_Product
J55660104XX012    49572
J55660202XX012    34554
J55660102XX012     2412
J55660702XX012     1494
J55661002XX012      372
J55660124XX012      180
J55660123XX012        6
Name: count, dtype: int64

Missing values CreditCard_Product que no se pudieron completar¶

In [ ]:
training_window[training_window.CreditCard_Product.fillna('Empty') == 'Empty']
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT
1 6371753 0.0 2018-09-01 2015-07-29 2018-06-02 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION CENTRO NaN 9 0.0
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION NORTE GRANDE ARGENTINO NaN 9 0.0
4 3615172 0.0 2018-09-01 2017-12-27 2017-12-28 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION CENTRO NaN 9 0.0
5 6412264 0.0 2018-09-01 2016-01-27 2019-01-03 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION NORTE GRANDE ARGENTINO NaN 9 0.0
6 6318899 0.0 2018-10-01 2015-03-26 2018-01-31 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION NORTE GRANDE ARGENTINO NaN 9 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139122 5940877 0.0 2018-08-01 2014-04-08 2018-12-19 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION NORTE GRANDE ARGENTINO NaN 9 0.0
139132 6258895 0.0 2019-01-01 2015-03-12 2018-03-02 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes BUENOS AIRES NaN 9 0.0
139135 6351091 0.0 2018-10-01 2015-05-06 2018-04-04 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes BUENOS AIRES NaN 9 0.0
139144 3824781 0.0 2018-08-01 2014-11-27 2019-01-04 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes BUENOS AIRES NaN 9 0.0
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 No No No Yes No ... 0.0 0.0 0.0 0.0 Yes No REGION PATAGONICA NaN 9 0.0

50556 rows × 79 columns

In [ ]:
data[data.client_id == '5967858']
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_External CreditCard_Payment_Cash CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product

0 rows × 77 columns

In [ ]:
training_window['CreditCard_Product'] = np.where(training_window['CreditCard_Product'].isna(), 
                                                          'No',
                                                         training_window['CreditCard_Product'])
In [ ]:
training_window[training_window.CreditCard_Product.fillna('Empty') == 'Empty']
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT

0 rows × 79 columns

In [ ]:
training_window.CreditCard_Product.value_counts()
Out[ ]:
CreditCard_Product
No                50556
J55660104XX012    49572
J55660202XX012    34554
J55660102XX012     2412
J55660702XX012     1494
J55661002XX012      372
J55660124XX012      180
J55660123XX012        6
Name: count, dtype: int64
In [ ]:
training_window.columns[training_window.isnull().any()].tolist()
Out[ ]:
[]

Outliers¶

In [ ]:
training_window['SavingAccount_Days_with_use'].describe()
Out[ ]:
count    139146.000000
mean          2.512685
std           3.831400
min           0.000000
25%           0.000000
50%           0.000000
75%           4.000000
max          41.000000
Name: SavingAccount_Days_with_use, dtype: float64
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# plot a distribution plot
sns.displot(training_window['SavingAccount_Balance_Average'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image
In [ ]:
plt.boxplot(training_window['SavingAccount_Balance_Average'])

# add labels for five number summary
plt.text(x = 1.1, y = training_window['SavingAccount_Balance_Average'].min(), s ='min')
plt.text(x = 1.1, y = training_window['SavingAccount_Balance_Average'].quantile(0.25), s ='Q1')
plt.text(x = 1.1, y = training_window['SavingAccount_Balance_Average'].median(), s ='meadian (Q2)')
plt.text(x = 1.1, y = training_window['SavingAccount_Balance_Average'].quantile(0.75), s ='Q3')
plt.text(x = 1.1, y = training_window['SavingAccount_Balance_Average'].max(), s ='max')


# add the graph title and axes labels
plt.title('Boxplot of Total Bill Amount')
plt.ylabel('Total bill')

# display the plot
plt.show()
No description has been provided for this image
In [ ]:
training_window['SavingAccount_Balance_Average'].describe()
Out[ ]:
count    1.391460e+05
mean     4.094871e+03
std      2.293939e+04
min     -5.665900e+02
25%      0.000000e+00
50%      1.032500e+01
75%      1.473138e+03
max      1.771201e+06
Name: SavingAccount_Balance_Average, dtype: float64
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# plot a distribution plot
sns.displot(training_window['SavingAccount_Balance_Average'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image
In [ ]:
p95 = training_window['SavingAccount_Balance_Average'].quantile(0.95)
p99 = training_window['SavingAccount_Balance_Average'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Balance_Average'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          17043.66
p99          70055.67999999813
Three sigma  68818.178586892
In [ ]:
training_window['SavingAccount_Balance_Average'] = np.where(training_window['SavingAccount_Balance_Average'] > three_sigma, 
                                                          three_sigma,
                                                         training_window['SavingAccount_Balance_Average'])

training_window['SavingAccount_Balance_Average'].describe()
Out[ ]:
count    139146.000000
mean       3179.457488
std        9821.897057
min        -566.590000
25%           0.000000
50%          10.325000
75%        1473.137500
max       68818.178587
Name: SavingAccount_Balance_Average, dtype: float64
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# plot a distribution plot
sns.displot(training_window['SavingAccount_Balance_Average'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image
In [ ]:
training_window['SavingAccount_Balance_Average'] = np.where(training_window['SavingAccount_Balance_Average'] > p95, 
                                                          p95,
                                                         training_window['SavingAccount_Balance_Average'])

training_window['SavingAccount_Balance_Average'].describe()
Out[ ]:
count    139146.000000
mean       2026.112910
std        4351.634834
min        -566.590000
25%           0.000000
50%          10.325000
75%        1473.137500
max       17043.660000
Name: SavingAccount_Balance_Average, dtype: float64
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# plot a distribution plot
sns.displot(training_window['SavingAccount_Balance_Average'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

Analisis de Outliers de variables 1 al 30¶

In [ ]:
variables_analisis_outliers = ['SavingAccount_Days_with_Credits',
                               'SavingAccount_Days_with_Debits',
                               'SavingAccount_Salary_Payment_Transactions',
                               'SavingAccount_Transfer_In_Transactions',
                               'SavingAccount_ATM_Extraction_Transactions',
                               'SavingAccount_CreditCard_Payment_Transactions',
                               'SavingAccount_Transfer_Out_Transactions',
                               'SavingAccount_DebitCard_Spend_Transactions',
                               'SavingAccount_Transactions_Transactions',
                               'SavingAccount_Credits_Transactions',
                               'SavingAccount_Debits_Transactions',
                               'SavingAccount_Salary_Payment_Amount',
                               'SavingAccount_Transfer_In_Amount',
                               'SavingAccount_ATM_Extraction_Amount',
                               'SavingAccount_CreditCard_Payment_Amount',
                               'SavingAccount_Transfer_Out_Amount',
                               'SavingAccount_DebitCard_Spend_Amount',
                               'SavingAccount_Total_Amount',
                               'SavingAccount_Credits_Amounts',
                               'SavingAccount_Debits_Amounts',
                               'Operations_HomeBanking',
                               'Operations_Mobile',
                               'CreditCard_Balance_ARG',
                               'CreditCard_Balance_DOLLAR',
                               'CreditCard_Total_Limit',
                               'CreditCard_Total_Spending',
                               'CreditCard_Spending_1_Installment',
                               'CreditCard_Spending_CrossBoarder',
                               'CreditCard_Spending_Aut_Debits',
                               'CreditCard_Revolving']
In [ ]:
for variables_analizables in variables_analisis_outliers:
    print('************************************************************ ')
    print('Variable ', variables_analizables)
    print(' ')
    
    training_window[variables_analizables].describe()
    # plot a distribution plot
    sns.displot(training_window[variables_analizables], kind='kde')

    # display the plot
    plt.show()
    
    plt.boxplot(training_window[variables_analizables])

    # add labels for five number summary
    plt.text(x = 1.1, y = training_window[variables_analizables].min(), s ='min')
    plt.text(x = 1.1, y = training_window[variables_analizables].quantile(0.25), s ='Q1')
    plt.text(x = 1.1, y = training_window[variables_analizables].median(), s ='meadian (Q2)')
    plt.text(x = 1.1, y = training_window[variables_analizables].quantile(0.75), s ='Q3')
    plt.text(x = 1.1, y = training_window[variables_analizables].max(), s ='max')


    # add the graph title and axes labels
    plt.title('Boxplot of Total Bill Amount')
    plt.ylabel('Total bill')

    # display the plot
    plt.show()
    
    p95 = training_window[variables_analizables].quantile(0.95)
    p99 = training_window[variables_analizables].quantile(0.99)
    three_sigma = 3 * training_window[variables_analizables].std()

    print('p95         ', p95)
    print('p99         ', p99)
    print('Three sigma ', three_sigma)
    
    training_window[variables_analizables].describe()
    
************************************************************ 
Variable  SavingAccount_Days_with_Credits
 
No description has been provided for this image
No description has been provided for this image
p95          5.0
p99          8.0
Three sigma  5.3821836766669335
************************************************************ 
Variable  SavingAccount_Days_with_Debits
 
No description has been provided for this image
No description has been provided for this image
p95          9.0
p99          15.0
Three sigma  9.723007390387242
************************************************************ 
Variable  SavingAccount_Salary_Payment_Transactions
 
No description has been provided for this image
No description has been provided for this image
p95          1.0
p99          3.0
Three sigma  1.7306415659542151
************************************************************ 
Variable  SavingAccount_Transfer_In_Transactions
 
No description has been provided for this image
No description has been provided for this image
p95          2.0
p99          4.0
Three sigma  3.4065241487924736
************************************************************ 
Variable  SavingAccount_ATM_Extraction_Transactions
 
No description has been provided for this image
No description has been provided for this image
p95          4.0
p99          12.0
Three sigma  6.793744483785929
************************************************************ 
Variable  SavingAccount_CreditCard_Payment_Transactions
 
No description has been provided for this image
No description has been provided for this image
p95          2.0
p99          4.0
Three sigma  2.820305708750473
************************************************************ 
Variable  SavingAccount_Transfer_Out_Transactions
 
No description has been provided for this image
No description has been provided for this image
p95          0.0
p99          0.0
Three sigma  0.3047514185040877
************************************************************ 
Variable  SavingAccount_DebitCard_Spend_Transactions
 
No description has been provided for this image
No description has been provided for this image
p95          7.0
p99          22.0
Three sigma  12.865522801673926
************************************************************ 
Variable  SavingAccount_Transactions_Transactions
 
No description has been provided for this image
No description has been provided for this image
p95          24.0
p99          47.0
Three sigma  28.791754836314986
************************************************************ 
Variable  SavingAccount_Credits_Transactions
 
No description has been provided for this image
No description has been provided for this image
p95          6.0
p99          11.0
Three sigma  7.817126304056519
************************************************************ 
Variable  SavingAccount_Debits_Transactions
 
No description has been provided for this image
No description has been provided for this image
p95          19.0
p99          38.0
Three sigma  22.894184199079866
************************************************************ 
Variable  SavingAccount_Salary_Payment_Amount
 
No description has been provided for this image
No description has been provided for this image
p95          22890.7775
p99          58568.9505
Three sigma  50991.24006782155
************************************************************ 
Variable  SavingAccount_Transfer_In_Amount
 
No description has been provided for this image
No description has been provided for this image
p95          13000.0
p99          43000.0
Three sigma  46628.46407356864
************************************************************ 
Variable  SavingAccount_ATM_Extraction_Amount
 
No description has been provided for this image
No description has been provided for this image
p95          8600.0
p99          25400.0
Three sigma  15161.853538210948
************************************************************ 
Variable  SavingAccount_CreditCard_Payment_Amount
 
No description has been provided for this image
No description has been provided for this image
p95          11868.3125
p99          33823.049999999756
Three sigma  31971.212947940097
************************************************************ 
Variable  SavingAccount_Transfer_Out_Amount
 
No description has been provided for this image
No description has been provided for this image
p95          0.0
p99          21453.013999999937
Three sigma  47523.41379065669
************************************************************ 
Variable  SavingAccount_DebitCard_Spend_Amount
 
No description has been provided for this image
No description has been provided for this image
p95          6350.3125
p99          20862.84349999993
Three sigma  13168.561377041347
************************************************************ 
Variable  SavingAccount_Total_Amount
 
No description has been provided for this image
No description has been provided for this image
p95          109939.47
p99          527795.7179999923
Three sigma  536040.4438446803
************************************************************ 
Variable  SavingAccount_Credits_Amounts
 
No description has been provided for this image
No description has been provided for this image
p95          54835.455
p99          269663.80649999913
Three sigma  271204.21506443416
************************************************************ 
Variable  SavingAccount_Debits_Amounts
 
No description has been provided for this image
No description has been provided for this image
p95          54108.784999999996
p99          268277.130499996
Three sigma  269760.6696250981
************************************************************ 
Variable  Operations_HomeBanking
 
No description has been provided for this image
No description has been provided for this image
p95          7.0
p99          15.0
Three sigma  8.928026498997632
************************************************************ 
Variable  Operations_Mobile
 
No description has been provided for this image
No description has been provided for this image
p95          5.0
p99          14.0
Three sigma  7.72057758014084
************************************************************ 
Variable  CreditCard_Balance_ARG
 
No description has been provided for this image
No description has been provided for this image
p95          30815.11
p99          55138.89599999998
Three sigma  35962.00973574623
************************************************************ 
Variable  CreditCard_Balance_DOLLAR
 
No description has been provided for this image
No description has been provided for this image
p95          0.99
p99          132.93649999999963
Three sigma  200.20502558797813
************************************************************ 
Variable  CreditCard_Total_Limit
 
No description has been provided for this image
No description has been provided for this image
p95          160000.0
p99          280000.0
Three sigma  184838.11811993475
************************************************************ 
Variable  CreditCard_Total_Spending
 
No description has been provided for this image
No description has been provided for this image
p95          17131.6725
p99          33148.23549999999
Three sigma  21870.01520929554
************************************************************ 
Variable  CreditCard_Spending_1_Installment
 
No description has been provided for this image
No description has been provided for this image
p95          7739.59
p99          19681.75649999997
Three sigma  12897.110532905726
************************************************************ 
Variable  CreditCard_Spending_CrossBoarder
 
No description has been provided for this image
No description has been provided for this image
p95          0.99
p99          143.6204999999987
Three sigma  212.90138519851982
************************************************************ 
Variable  CreditCard_Spending_Aut_Debits
 
No description has been provided for this image
No description has been provided for this image
p95          5619.7275
p99          12510.912499999991
Three sigma  7806.871328247001
************************************************************ 
Variable  CreditCard_Revolving
 
No description has been provided for this image
No description has been provided for this image
p95          17291.8975
p99          36175.25699999951
Three sigma  23840.512630865414

quitando outliers Variable: SavingAccount_Days_with_Credits¶

In [ ]:
p95 = training_window['SavingAccount_Days_with_Credits'].quantile(0.95)
p99 = training_window['SavingAccount_Days_with_Credits'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Days_with_Credits'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          5.0
p99          8.0
Three sigma  5.3821836766669335
In [ ]:
training_window['SavingAccount_Days_with_Credits'] = np.where(training_window['SavingAccount_Days_with_Credits'] > p99, 
                                                          p99,
                                                         training_window['SavingAccount_Days_with_Credits'])

training_window['SavingAccount_Days_with_Credits'].describe()
Out[ ]:
count    139146.000000
mean          1.232504
std           1.691454
min           0.000000
25%           0.000000
50%           0.000000
75%           2.000000
max           8.000000
Name: SavingAccount_Days_with_Credits, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Days_with_Credits'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Days_with_Debits¶

In [ ]:
p95 = training_window['SavingAccount_Days_with_Debits'].quantile(0.95)
p99 = training_window['SavingAccount_Days_with_Debits'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Days_with_Debits'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          9.0
p99          15.0
Three sigma  9.723007390387242
In [ ]:
training_window['SavingAccount_Days_with_Debits'] = np.where(training_window['SavingAccount_Days_with_Debits'] > three_sigma, 
                                                          three_sigma,
                                                         training_window['SavingAccount_Days_with_Debits'])

training_window['SavingAccount_Days_with_Debits'].describe()
Out[ ]:
count    139146.000000
mean          1.690302
std           2.649375
min           0.000000
25%           0.000000
50%           0.000000
75%           2.000000
max           9.723007
Name: SavingAccount_Days_with_Debits, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Days_with_Debits'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Salary_Payment_Transactions¶

In [ ]:
p95 = training_window['SavingAccount_Salary_Payment_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Salary_Payment_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Salary_Payment_Transactions'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          1.0
p99          3.0
Three sigma  1.7306415659542151
In [ ]:
training_window['SavingAccount_Salary_Payment_Transactions'] = np.where(training_window['SavingAccount_Salary_Payment_Transactions'] > p99, 
                                                          p99,
                                                         training_window['SavingAccount_Salary_Payment_Transactions'])

training_window['SavingAccount_Salary_Payment_Transactions'].describe()
Out[ ]:
count    139146.000000
mean          0.142534
std           0.520272
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           3.000000
Name: SavingAccount_Salary_Payment_Transactions, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Days_with_Debits'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Transfer_In_Transactions¶

In [ ]:
p95 = training_window['SavingAccount_Transfer_In_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Transfer_In_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Transfer_In_Transactions'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          2.0
p99          4.0
Three sigma  3.4065241487924736
In [ ]:
training_window['SavingAccount_Transfer_In_Transactions'] = np.where(training_window['SavingAccount_Transfer_In_Transactions'] > p99, 
                                                          p99,
                                                         training_window['SavingAccount_Transfer_In_Transactions'])

training_window['SavingAccount_Transfer_In_Transactions'].describe()
Out[ ]:
count    139146.000000
mean          0.314993
std           0.750827
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           4.000000
Name: SavingAccount_Transfer_In_Transactions, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Transfer_In_Transactions'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_ATM_Extraction_Transactions¶

In [ ]:
p95 = training_window['SavingAccount_ATM_Extraction_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_ATM_Extraction_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_ATM_Extraction_Transactions'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          4.0
p99          12.0
Three sigma  6.793744483785929
In [ ]:
training_window['SavingAccount_ATM_Extraction_Transactions'] = np.where(training_window['SavingAccount_ATM_Extraction_Transactions'] > p95, 
                                                          p95,
                                                         training_window['SavingAccount_ATM_Extraction_Transactions'])

training_window['SavingAccount_ATM_Extraction_Transactions'].describe()
Out[ ]:
count    139146.000000
mean          0.320505
std           1.006634
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           4.000000
Name: SavingAccount_ATM_Extraction_Transactions, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_ATM_Extraction_Transactions'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_CreditCard_Payment_Transactions¶

In [ ]:
p95 = training_window['SavingAccount_CreditCard_Payment_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_CreditCard_Payment_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_CreditCard_Payment_Transactions'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          2.0
p99          4.0
Three sigma  2.820305708750473
In [ ]:
training_window['SavingAccount_CreditCard_Payment_Transactions'] = np.where(training_window['SavingAccount_CreditCard_Payment_Transactions'] > p99, 
                                                          p99,
                                                         training_window['SavingAccount_CreditCard_Payment_Transactions'])

training_window['SavingAccount_CreditCard_Payment_Transactions'].describe()
Out[ ]:
count    139146.000000
mean          0.359170
std           0.767203
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           4.000000
Name: SavingAccount_CreditCard_Payment_Transactions, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_ATM_Extraction_Transactions'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Transfer_Out_Transactions¶

In [ ]:
p95 = training_window['SavingAccount_Transfer_Out_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Transfer_Out_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Transfer_Out_Transactions'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          0.0
p99          0.0
Three sigma  0.3047514185040877
In [ ]:
training_window.SavingAccount_Transfer_Out_Transactions.value_counts()
Out[ ]:
SavingAccount_Transfer_Out_Transactions
0.0     139099
1.0         37
2.0          3
10.0         1
9.0          1
15.0         1
25.0         1
14.0         1
4.0          1
12.0         1
Name: count, dtype: int64
In [ ]:
# revisar, preguntar sobre esta variable: SavingAccount_Transfer_Out_Transactions

quitando outliers Variable: SavingAccount_DebitCard_Spend_Transactions¶

In [ ]:
p95 = training_window['SavingAccount_DebitCard_Spend_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_DebitCard_Spend_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_DebitCard_Spend_Transactions'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          7.0
p99          22.0
Three sigma  12.865522801673926
In [ ]:
training_window['SavingAccount_DebitCard_Spend_Transactions'] = np.where(training_window['SavingAccount_DebitCard_Spend_Transactions'] > p99, 
                                                          p99,
                                                         training_window['SavingAccount_DebitCard_Spend_Transactions'])

training_window['SavingAccount_DebitCard_Spend_Transactions'].describe()
Out[ ]:
count    139146.000000
mean          0.991390
std           3.528173
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          22.000000
Name: SavingAccount_DebitCard_Spend_Transactions, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_DebitCard_Spend_Transactions'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Transactions_Transactions¶

In [ ]:
p95 = training_window['SavingAccount_Transactions_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Transactions_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Transactions_Transactions'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          24.0
p99          47.0
Three sigma  28.791754836314986
In [ ]:
training_window['SavingAccount_Transactions_Transactions'] = np.where(training_window['SavingAccount_Transactions_Transactions'] > p99, 
                                                          p99,
                                                         training_window['SavingAccount_Transactions_Transactions'])

training_window['SavingAccount_Transactions_Transactions'].describe()
Out[ ]:
count    139146.000000
mean          4.765656
std           8.710314
min           0.000000
25%           0.000000
50%           0.000000
75%           5.000000
max          47.000000
Name: SavingAccount_Transactions_Transactions, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Transactions_Transactions'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Credits_Transactions¶

In [ ]:
p95 = training_window['SavingAccount_Credits_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Credits_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Credits_Transactions'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          6.0
p99          11.0
Three sigma  7.817126304056519
In [ ]:
training_window['SavingAccount_Credits_Transactions'] = np.where(training_window['SavingAccount_Credits_Transactions'] > p99, 
                                                          p99,
                                                         training_window['SavingAccount_Credits_Transactions'])

training_window['SavingAccount_Credits_Transactions'].describe()
Out[ ]:
count    139146.000000
mean          1.509573
std           2.220993
min           0.000000
25%           0.000000
50%           0.000000
75%           2.000000
max          11.000000
Name: SavingAccount_Credits_Transactions, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Credits_Transactions'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Debits_Transactions¶

In [ ]:
p95 = training_window['SavingAccount_Debits_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Debits_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Debits_Transactions'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          19.0
p99          38.0
Three sigma  22.894184199079866
In [ ]:
training_window['SavingAccount_Debits_Transactions'] = np.where(training_window['SavingAccount_Debits_Transactions'] > p99, 
                                                          p99,
                                                         training_window['SavingAccount_Debits_Transactions'])

training_window['SavingAccount_Debits_Transactions'].describe()
Out[ ]:
count    139146.000000
mean          3.222780
std           6.824929
min           0.000000
25%           0.000000
50%           0.000000
75%           3.000000
max          38.000000
Name: SavingAccount_Debits_Transactions, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Debits_Transactions'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Salary_Payment_Amount¶

In [ ]:
p95 = training_window['SavingAccount_Salary_Payment_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_Salary_Payment_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Salary_Payment_Amount'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          22890.7775
p99          58568.9505
Three sigma  50991.24006782155
In [ ]:
training_window['SavingAccount_Salary_Payment_Amount'] = np.where(training_window['SavingAccount_Salary_Payment_Amount'] > p95, 
                                                          p95,
                                                         training_window['SavingAccount_Salary_Payment_Amount'])

training_window['SavingAccount_Salary_Payment_Amount'].describe()
Out[ ]:
count    139146.000000
mean       1621.769465
std        5617.856081
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max       22890.777500
Name: SavingAccount_Salary_Payment_Amount, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Salary_Payment_Amount'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Transfer_In_Amount¶

In [ ]:
p95 = training_window['SavingAccount_Transfer_In_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_Transfer_In_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Transfer_In_Amount'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          13000.0
p99          43000.0
Three sigma  46628.46407356864
In [ ]:
training_window['SavingAccount_Transfer_In_Amount'] = np.where(training_window['SavingAccount_Transfer_In_Amount'] > p95, 
                                                          p95,
                                                         training_window['SavingAccount_Transfer_In_Amount'])

training_window['SavingAccount_Transfer_In_Amount'].describe()
Out[ ]:
count    139146.000000
mean       1420.367686
std        3444.238763
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max       13000.000000
Name: SavingAccount_Transfer_In_Amount, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Transfer_In_Amount'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_ATM_Extraction_Amount¶

In [ ]:
p95 = training_window['SavingAccount_ATM_Extraction_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_ATM_Extraction_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_ATM_Extraction_Amount'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          8600.0
p99          25400.0
Three sigma  15161.853538210948
In [ ]:
training_window['SavingAccount_ATM_Extraction_Amount'] = np.where(training_window['SavingAccount_ATM_Extraction_Amount'] > p95, 
                                                          p95,
                                                         training_window['SavingAccount_ATM_Extraction_Amount'])

training_window['SavingAccount_ATM_Extraction_Amount'].describe()
Out[ ]:
count    139146.000000
mean        642.253338
std        2102.709382
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max        8600.000000
Name: SavingAccount_ATM_Extraction_Amount, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_ATM_Extraction_Amount'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_CreditCard_Payment_Amount¶

In [ ]:
p95 = training_window['SavingAccount_CreditCard_Payment_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_CreditCard_Payment_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_CreditCard_Payment_Amount'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          11868.3125
p99          33823.049999999756
Three sigma  31971.212947940097
In [ ]:
training_window['SavingAccount_CreditCard_Payment_Amount'] = np.where(training_window['SavingAccount_CreditCard_Payment_Amount'] > three_sigma, 
                                                          three_sigma,
                                                         training_window['SavingAccount_CreditCard_Payment_Amount'])

training_window['SavingAccount_CreditCard_Payment_Amount'].describe()
Out[ ]:
count    139146.000000
mean       1840.757643
std        5226.661125
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max       31971.212948
Name: SavingAccount_CreditCard_Payment_Amount, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_CreditCard_Payment_Amount'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Transfer_Out_Amount¶

In [ ]:
p95 = training_window['SavingAccount_Transfer_Out_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_Transfer_Out_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Transfer_Out_Amount'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          0.0
p99          21453.013999999937
Three sigma  47523.41379065669
In [ ]:
training_window['SavingAccount_Transfer_Out_Amount'] = np.where(training_window['SavingAccount_Transfer_Out_Amount'] > p99, 
                                                          p99,
                                                         training_window['SavingAccount_Transfer_Out_Amount'])

training_window['SavingAccount_Transfer_Out_Amount'].describe()
Out[ ]:
count    139146.000000
mean        451.299070
std        2652.509583
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max       21453.014000
Name: SavingAccount_Transfer_Out_Amount, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Transfer_Out_Amount'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_DebitCard_Spend_Amount¶

In [ ]:
p95 = training_window['SavingAccount_DebitCard_Spend_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_DebitCard_Spend_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_DebitCard_Spend_Amount'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          6350.3125
p99          20862.84349999993
Three sigma  13168.561377041347
In [ ]:
training_window['SavingAccount_DebitCard_Spend_Amount'] = np.where(training_window['SavingAccount_DebitCard_Spend_Amount'] > p95, 
                                                          p95,
                                                         training_window['SavingAccount_DebitCard_Spend_Amount'])

training_window['SavingAccount_DebitCard_Spend_Amount'].describe()
Out[ ]:
count    139146.000000
mean        504.030718
std        1567.015956
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max        6350.312500
Name: SavingAccount_DebitCard_Spend_Amount, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_DebitCard_Spend_Amount'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Total_Amount¶

In [ ]:
p95 = training_window['SavingAccount_Total_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_Total_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Total_Amount'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          109939.47
p99          527795.7179999923
Three sigma  536040.4438446803
In [ ]:
training_window['SavingAccount_Total_Amount'] = np.where(training_window['SavingAccount_Total_Amount'] > p95, 
                                                          p95,
                                                         training_window['SavingAccount_Total_Amount'])

training_window['SavingAccount_Total_Amount'].describe()
Out[ ]:
count    139146.000000
mean      15375.232616
std       29201.571404
min           0.000000
25%           0.000000
50%           0.000000
75%       14466.357500
max      109939.470000
Name: SavingAccount_Total_Amount, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Total_Amount'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: SavingAccount_Credits_Amounts¶

In [ ]:
p95 = training_window['SavingAccount_Credits_Amounts'].quantile(0.95)
p99 = training_window['SavingAccount_Credits_Amounts'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Credits_Amounts'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          54835.455
p99          269663.80649999913
Three sigma  271204.21506443416
In [ ]:
training_window['SavingAccount_Credits_Amounts'] = np.where(training_window['SavingAccount_Credits_Amounts'] > p95, 
                                                          p95,
                                                         training_window['SavingAccount_Credits_Amounts'])

training_window['SavingAccount_Credits_Amounts'].describe()
Out[ ]:
count    139146.000000
mean       7558.049240
std       14577.499232
min           0.000000
25%           0.000000
50%           0.000000
75%        7000.250000
max       54835.455000
Name: SavingAccount_Credits_Amounts, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Credits_Amounts'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image
In [ ]:
## quitando outliers Variable: SavingAccount_Debits_Amounts
In [ ]:
p95 = training_window['SavingAccount_Debits_Amounts'].quantile(0.95)
p99 = training_window['SavingAccount_Debits_Amounts'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Debits_Amounts'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          54108.784999999996
p99          268277.130499996
Three sigma  269760.6696250981
In [ ]:
training_window['SavingAccount_Debits_Amounts'] = np.where(training_window['SavingAccount_Debits_Amounts'] > p95, 
                                                          p95,
                                                         training_window['SavingAccount_Debits_Amounts'])

training_window['SavingAccount_Debits_Amounts'].describe()
Out[ ]:
count    139146.000000
mean       7539.658027
std       14341.236934
min           0.000000
25%           0.000000
50%           0.000000
75%        7102.057500
max       54108.785000
Name: SavingAccount_Debits_Amounts, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['SavingAccount_Debits_Amounts'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: Operations_HomeBanking¶

In [ ]:
p95 = training_window['Operations_HomeBanking'].quantile(0.95)
p99 = training_window['Operations_HomeBanking'].quantile(0.99)
three_sigma = 3 * training_window['Operations_HomeBanking'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          7.0
p99          15.0
Three sigma  8.928026498997632
In [ ]:
training_window['Operations_HomeBanking'] = np.where(training_window['Operations_HomeBanking'] > p99, 
                                                          p99,
                                                         training_window['Operations_HomeBanking'])

training_window['Operations_HomeBanking'].describe()
Out[ ]:
count    139146.000000
mean          1.213186
std           2.761946
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          15.000000
Name: Operations_HomeBanking, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['Operations_HomeBanking'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: Operations_Mobile¶

In [ ]:
p95 = training_window['Operations_Mobile'].quantile(0.95)
p99 = training_window['Operations_Mobile'].quantile(0.99)
three_sigma = 3 * training_window['Operations_Mobile'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          5.0
p99          14.0
Three sigma  7.72057758014084
In [ ]:
training_window['Operations_Mobile'] = np.where(training_window['Operations_Mobile'] > p99, 
                                                          p99,
                                                         training_window['Operations_Mobile'])

training_window['Operations_Mobile'].describe()
Out[ ]:
count    139146.000000
mean          0.624617
std           2.277957
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          14.000000
Name: Operations_Mobile, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['Operations_Mobile'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: CreditCard_Balance_ARG¶

In [ ]:
p95 = training_window['CreditCard_Balance_ARG'].quantile(0.95)
p99 = training_window['CreditCard_Balance_ARG'].quantile(0.99)
np99 = -training_window['CreditCard_Balance_ARG'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Balance_ARG'].std()

print('p95         ', p95)
print('p99         ', p99)
print('np99         ', np99)
print('Three sigma ', three_sigma)
p95          30815.11
p99          55138.89599999998
np99          -55138.89599999998
Three sigma  35962.00973574623
In [ ]:
training_window['CreditCard_Balance_ARG'] = np.where(training_window['CreditCard_Balance_ARG'] > p99, 
                                                          p99,
                                                         training_window['CreditCard_Balance_ARG'])

training_window['CreditCard_Balance_ARG'] = np.where(training_window['CreditCard_Balance_ARG'] < np99, 
                                                          np99,
                                                         training_window['CreditCard_Balance_ARG'])

training_window['CreditCard_Balance_ARG'].describe()
Out[ ]:
count    139146.000000
mean       7084.210404
std       10832.826535
min      -55138.896000
25%           0.000000
50%        2716.210000
75%        9483.642500
max       55138.896000
Name: CreditCard_Balance_ARG, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['CreditCard_Balance_ARG'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: CreditCard_Balance_DOLLAR¶

In [ ]:
p95 = training_window['CreditCard_Balance_DOLLAR'].quantile(0.95)
p99 = training_window['CreditCard_Balance_DOLLAR'].quantile(0.99)
np99 = -training_window['CreditCard_Balance_DOLLAR'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Balance_DOLLAR'].std()

print('p95         ', p95)
print('p99         ', p99)
print('np99         ', np99)
print('Three sigma ', three_sigma)
p95          0.99
p99          132.93649999999963
np99          -132.93649999999963
Three sigma  200.20502558797813
In [ ]:
training_window['CreditCard_Balance_DOLLAR'] = np.where(training_window['CreditCard_Balance_DOLLAR'] > p99, 
                                                          p99,
                                                         training_window['CreditCard_Balance_DOLLAR'])

training_window['CreditCard_Balance_DOLLAR'] = np.where(training_window['CreditCard_Balance_DOLLAR'] < np99, 
                                                          np99,
                                                         training_window['CreditCard_Balance_DOLLAR'])

training_window['CreditCard_Balance_DOLLAR'].describe()
Out[ ]:
count    139146.000000
mean          2.161760
std          15.995588
min        -132.936500
25%           0.000000
50%           0.000000
75%           0.000000
max         132.936500
Name: CreditCard_Balance_DOLLAR, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['CreditCard_Balance_DOLLAR'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: CreditCard_Total_Limit¶

In [ ]:
p95 = training_window['CreditCard_Total_Limit'].quantile(0.95)
p99 = training_window['CreditCard_Total_Limit'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Total_Limit'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          160000.0
p99          280000.0
Three sigma  184838.11811993475
In [ ]:
training_window['CreditCard_Total_Limit'] = np.where(training_window['CreditCard_Total_Limit'] > p99, 
                                                          p99,
                                                         training_window['CreditCard_Total_Limit'])

training_window['CreditCard_Total_Limit'].describe()
Out[ ]:
count    139146.000000
mean      51060.982709
std       56753.331259
min           0.000000
25%           0.000000
50%       40000.000000
75%       64000.000000
max      280000.000000
Name: CreditCard_Total_Limit, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['CreditCard_Total_Limit'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: CreditCard_Total_Spending¶

In [ ]:
p95 = training_window['CreditCard_Total_Spending'].quantile(0.95)
p99 = training_window['CreditCard_Total_Spending'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Total_Spending'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          17131.6725
p99          33148.23549999999
Three sigma  21870.01520929554
In [ ]:
training_window['CreditCard_Total_Spending'] = np.where(training_window['CreditCard_Total_Spending'] > p99, 
                                                          p99,
                                                         training_window['CreditCard_Total_Spending'])

training_window['CreditCard_Total_Spending'].describe()
Out[ ]:
count    139146.000000
mean       4076.655867
std        6249.673704
min      -30164.770000
25%           0.000000
50%        1482.710000
75%        5656.147500
max       33148.235500
Name: CreditCard_Total_Spending, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['CreditCard_Total_Spending'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: CreditCard_Spending_1_Installment¶

In [ ]:
p95 = training_window['CreditCard_Spending_1_Installment'].quantile(0.95)
p99 = training_window['CreditCard_Spending_1_Installment'].quantile(0.99)
np99 = -training_window['CreditCard_Spending_1_Installment'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Spending_1_Installment'].std()

print('p95         ', p95)
print('p99         ', p99)
print('np99         ', np99)
print('Three sigma ', three_sigma)
p95          7739.59
p99          19681.75649999997
np99          -19681.75649999997
Three sigma  12897.110532905726
In [ ]:
training_window['CreditCard_Spending_1_Installment'] = np.where(training_window['CreditCard_Spending_1_Installment'] > p99, 
                                                          p99,
                                                         training_window['CreditCard_Spending_1_Installment'])

training_window['CreditCard_Spending_1_Installment'] = np.where(training_window['CreditCard_Spending_1_Installment'] < np99, 
                                                          np99,
                                                         training_window['CreditCard_Spending_1_Installment'])

training_window['CreditCard_Spending_1_Installment'].describe()
Out[ ]:
count    139146.000000
mean       1268.888989
std        3255.660124
min      -19681.756500
25%           0.000000
50%           0.000000
75%         656.180000
max       19681.756500
Name: CreditCard_Spending_1_Installment, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['CreditCard_Spending_1_Installment'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: CreditCard_Spending_CrossBoarder¶

In [ ]:
p95 = training_window['CreditCard_Spending_CrossBoarder'].quantile(0.95)
p99 = training_window['CreditCard_Spending_CrossBoarder'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Spending_CrossBoarder'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          0.99
p99          143.6204999999987
Three sigma  212.90138519851982
In [ ]:
training_window['CreditCard_Spending_CrossBoarder'] = np.where(training_window['CreditCard_Spending_CrossBoarder'] > p99, 
                                                          p99,
                                                         training_window['CreditCard_Spending_CrossBoarder'])

training_window['CreditCard_Spending_CrossBoarder'].describe()
Out[ ]:
count    139146.000000
mean          2.589213
std          16.595921
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         143.620500
Name: CreditCard_Spending_CrossBoarder, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['CreditCard_Spending_CrossBoarder'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

quitando outliers Variable: CreditCard_Spending_Aut_Debits¶

In [ ]:
p95 = training_window['CreditCard_Spending_Aut_Debits'].quantile(0.95)
p99 = training_window['CreditCard_Spending_Aut_Debits'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Spending_Aut_Debits'].std()

print('p95         ', p95)
print('p99         ', p99)
print('Three sigma ', three_sigma)
p95          5619.7275
p99          12510.912499999991
Three sigma  7806.871328247001
In [ ]:
training_window['CreditCard_Spending_Aut_Debits'] = np.where(training_window['CreditCard_Spending_Aut_Debits'] > p99, 
                                                          p99,
                                                         training_window['CreditCard_Spending_Aut_Debits'])

training_window['CreditCard_Spending_Aut_Debits'].describe()
Out[ ]:
count    139146.000000
mean       1033.300702
std        2209.574033
min       -9476.360000
25%           0.000000
50%           0.000000
75%         947.745000
max       12510.912500
Name: CreditCard_Spending_Aut_Debits, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['CreditCard_Spending_Aut_Debits'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image
In [ ]:
## quitando outliers Variable: CreditCard_Revolving
In [ ]:
p95 = training_window['CreditCard_Revolving'].quantile(0.95)
p99 = training_window['CreditCard_Revolving'].quantile(0.99)
np99 = -training_window['CreditCard_Revolving'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Revolving'].std()

print('p95         ', p95)
print('p99         ', p99)
print('np99         ', np99)
print('Three sigma ', three_sigma)
p95          17291.8975
p99          36175.25699999951
np99          -36175.25699999951
Three sigma  23840.512630865414
In [ ]:
training_window['CreditCard_Revolving'] = np.where(training_window['CreditCard_Revolving'] > p99, 
                                                          p99,
                                                         training_window['CreditCard_Revolving'])

training_window['CreditCard_Revolving'] = np.where(training_window['CreditCard_Revolving'] < np99, 
                                                          np99,
                                                         training_window['CreditCard_Revolving'])
training_window['CreditCard_Revolving'].describe()
Out[ ]:
count    139146.000000
mean       2040.762848
std        6810.370574
min      -36175.257000
25%           0.000000
50%           0.000000
75%           0.000000
max       36175.257000
Name: CreditCard_Revolving, dtype: float64
In [ ]:
# plot a distribution plot
sns.displot(training_window['CreditCard_Revolving'], kind='kde')

# display the plot
plt.show()
No description has been provided for this image

Transform Features¶

In [ ]:
training_window
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT
0 5856970 1.0 2018-10-01 2013-10-23 2019-01-10 No Yes No No No ... 0.0 0.0 0.0 1.0 Yes Yes AMBA Resto J55660202XX012 9 1.0
1 6371753 0.0 2018-09-01 2015-07-29 2018-06-02 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION CENTRO No 9 0.0
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION NORTE GRANDE ARGENTINO No 9 0.0
3 475064 0.0 2018-12-01 2014-07-13 2017-11-30 No Yes No No No ... 0.0 0.0 0.0 0.0 Yes Yes REGION CUYO J55660202XX012 9 0.0
4 3615172 0.0 2018-09-01 2017-12-27 2017-12-28 No No No No No ... 0.0 0.0 0.0 0.0 Yes No REGION CENTRO No 9 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139141 1673642 0.0 2018-11-01 2017-08-18 2017-09-26 No Yes No No No ... 1.0 0.0 0.0 0.0 No Yes BUENOS AIRES J55660104XX012 9 0.0
139142 6145735 1.0 2018-11-01 2014-10-26 2014-10-26 No Yes No No No ... 1.0 0.0 0.0 0.0 Yes Yes REGION PATAGONICA J55660202XX012 9 1.0
139143 5638786 1.0 2018-11-01 2012-12-26 2017-03-08 No Yes No No No ... 0.0 0.0 1.0 0.0 Yes No REGION NORTE GRANDE ARGENTINO J55660202XX012 9 1.0
139144 3824781 0.0 2018-08-01 2014-11-27 2019-01-04 No No No No No ... 0.0 0.0 0.0 0.0 Yes Yes BUENOS AIRES No 9 0.0
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 No No No Yes No ... 0.0 0.0 0.0 0.0 Yes No REGION PATAGONICA No 9 0.0

139146 rows × 79 columns

variables con valores (Yes/No), (M/F) y las edades¶

In [ ]:
columnas = ['CreditCard_Premium','CreditCard_Active','CreditCard_CoBranding','Loan_Active',
            'Mortgage_Active', 'SavingAccount_Active_ARG_Salary','SavingAccount_Active_ARG','SavingAccount_Active_DOLLAR'
           ,'DebitCard_Active','Investment_Active','Package_Active','Insurance_Life'
           ,'Insurance_Home','Insurance_Accidents','Insurance_Mobile','Insurance_ATM','Insurance_Unemployment','Mobile','Email']

for columna in columnas:
    training_window[columna] = np.where(training_window[columna] == 'Yes',1,0)
#----------    

training_window['Sex'] = np.where(training_window['Sex'] == 'F', 0, 1)

#---------------

di = { 
 "Entre 40 y 49 años"          : 40,
 "Entre 30 y 39 años"                 : 30,
    "Entre 50 y 59 años"                 : 50,
    "Entre 60 y 64 años"                 : 60,
    "Entre 65 y 69 años"                 : 65,
    "Entre 18 y 29 años"                 : 18,
    "Mayor a 70 años"                 : 70,
    "Menor a 18 años"                 : 17,
 }
training_window.Client_Age_grp = training_window.Client_Age_grp.map(di)
In [ ]:
training_window['Client_Age_grp'].value_counts()
Out[ ]:
Client_Age_grp
40    38313
30    34063
50    30588
60    12282
65     9685
18     7509
70     6701
17        5
Name: count, dtype: int64
In [ ]:
training_window
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Payment_Web CreditCard_Payment_ATM CreditCard_Payment_TAS Investment_Numbers Mobile Email Region CreditCard_Product cantidad_meses TGT
0 5856970 1.0 2018-10-01 2013-10-23 2019-01-10 0 1 0 0 0 ... 0.0 0.0 0.0 1.0 1 1 AMBA Resto J55660202XX012 9 1.0
1 6371753 0.0 2018-09-01 2015-07-29 2018-06-02 0 0 0 0 0 ... 0.0 0.0 0.0 0.0 1 0 REGION CENTRO No 9 0.0
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 0 0 0 0 0 ... 0.0 0.0 0.0 0.0 1 1 REGION NORTE GRANDE ARGENTINO No 9 0.0
3 475064 0.0 2018-12-01 2014-07-13 2017-11-30 0 1 0 0 0 ... 0.0 0.0 0.0 0.0 1 1 REGION CUYO J55660202XX012 9 0.0
4 3615172 0.0 2018-09-01 2017-12-27 2017-12-28 0 0 0 0 0 ... 0.0 0.0 0.0 0.0 1 0 REGION CENTRO No 9 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139141 1673642 0.0 2018-11-01 2017-08-18 2017-09-26 0 1 0 0 0 ... 1.0 0.0 0.0 0.0 0 1 BUENOS AIRES J55660104XX012 9 0.0
139142 6145735 1.0 2018-11-01 2014-10-26 2014-10-26 0 1 0 0 0 ... 1.0 0.0 0.0 0.0 1 1 REGION PATAGONICA J55660202XX012 9 1.0
139143 5638786 1.0 2018-11-01 2012-12-26 2017-03-08 0 1 0 0 0 ... 0.0 0.0 1.0 0.0 1 0 REGION NORTE GRANDE ARGENTINO J55660202XX012 9 1.0
139144 3824781 0.0 2018-08-01 2014-11-27 2019-01-04 0 0 0 0 0 ... 0.0 0.0 0.0 0.0 1 1 BUENOS AIRES No 9 0.0
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 0 0 0 1 0 ... 0.0 0.0 0.0 0.0 1 0 REGION PATAGONICA No 9 0.0

139146 rows × 79 columns

One Hot Encoding en variable Region¶

In [ ]:
training_window = pd.get_dummies(training_window, columns = ['Region'])

training_window
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Product cantidad_meses TGT Region_AMBA Resto Region_BUENOS AIRES Region_CABA Centro/Norte Region_REGION CENTRO Region_REGION CUYO Region_REGION NORTE GRANDE ARGENTINO Region_REGION PATAGONICA
0 5856970 1.0 2018-10-01 2013-10-23 2019-01-10 0 1 0 0 0 ... J55660202XX012 9 1.0 True False False False False False False
1 6371753 0.0 2018-09-01 2015-07-29 2018-06-02 0 0 0 0 0 ... No 9 0.0 False False False True False False False
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 0 0 0 0 0 ... No 9 0.0 False False False False False True False
3 475064 0.0 2018-12-01 2014-07-13 2017-11-30 0 1 0 0 0 ... J55660202XX012 9 0.0 False False False False True False False
4 3615172 0.0 2018-09-01 2017-12-27 2017-12-28 0 0 0 0 0 ... No 9 0.0 False False False True False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139141 1673642 0.0 2018-11-01 2017-08-18 2017-09-26 0 1 0 0 0 ... J55660104XX012 9 0.0 False True False False False False False
139142 6145735 1.0 2018-11-01 2014-10-26 2014-10-26 0 1 0 0 0 ... J55660202XX012 9 1.0 False False False False False False True
139143 5638786 1.0 2018-11-01 2012-12-26 2017-03-08 0 1 0 0 0 ... J55660202XX012 9 1.0 False False False False False True False
139144 3824781 0.0 2018-08-01 2014-11-27 2019-01-04 0 0 0 0 0 ... No 9 0.0 False True False False False False False
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 0 0 0 1 0 ... No 9 0.0 False False False False False False True

139146 rows × 85 columns

In [ ]:
columnas = ['Region_AMBA Resto','Region_BUENOS AIRES','Region_CABA Centro/Norte',
            'Region_REGION CENTRO', 'Region_REGION CUYO','Region_REGION NORTE GRANDE ARGENTINO','Region_REGION PATAGONICA']

for columna in columnas:
    training_window[columna] = np.where(training_window[columna] == True,1,0)
    
training_window    
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active CreditCard_CoBranding Loan_Active Mortgage_Active ... CreditCard_Product cantidad_meses TGT Region_AMBA Resto Region_BUENOS AIRES Region_CABA Centro/Norte Region_REGION CENTRO Region_REGION CUYO Region_REGION NORTE GRANDE ARGENTINO Region_REGION PATAGONICA
0 5856970 1.0 2018-10-01 2013-10-23 2019-01-10 0 1 0 0 0 ... J55660202XX012 9 1.0 1 0 0 0 0 0 0
1 6371753 0.0 2018-09-01 2015-07-29 2018-06-02 0 0 0 0 0 ... No 9 0.0 0 0 0 1 0 0 0
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 0 0 0 0 0 ... No 9 0.0 0 0 0 0 0 1 0
3 475064 0.0 2018-12-01 2014-07-13 2017-11-30 0 1 0 0 0 ... J55660202XX012 9 0.0 0 0 0 0 1 0 0
4 3615172 0.0 2018-09-01 2017-12-27 2017-12-28 0 0 0 0 0 ... No 9 0.0 0 0 0 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139141 1673642 0.0 2018-11-01 2017-08-18 2017-09-26 0 1 0 0 0 ... J55660104XX012 9 0.0 0 1 0 0 0 0 0
139142 6145735 1.0 2018-11-01 2014-10-26 2014-10-26 0 1 0 0 0 ... J55660202XX012 9 1.0 0 0 0 0 0 0 1
139143 5638786 1.0 2018-11-01 2012-12-26 2017-03-08 0 1 0 0 0 ... J55660202XX012 9 1.0 0 0 0 0 0 1 0
139144 3824781 0.0 2018-08-01 2014-11-27 2019-01-04 0 0 0 0 0 ... No 9 0.0 0 1 0 0 0 0 0
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 0 0 0 1 0 ... No 9 0.0 0 0 0 0 0 0 1

139146 rows × 85 columns

Identity Features to ABT¶

In [ ]:
training_window.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139146 entries, 0 to 139145
Data columns (total 85 columns):
 #   Column                                         Non-Null Count   Dtype  
---  ------                                         --------------   -----  
 0   client_id                                      139146 non-null  int32  
 1   Target                                         139146 non-null  float64
 2   Month                                          139146 non-null  object 
 3   First_product_dt                               139146 non-null  object 
 4   Last_product_dt                                139146 non-null  object 
 5   CreditCard_Premium                             139146 non-null  int32  
 6   CreditCard_Active                              139146 non-null  int32  
 7   CreditCard_CoBranding                          139146 non-null  int32  
 8   Loan_Active                                    139146 non-null  int32  
 9   Mortgage_Active                                139146 non-null  int32  
 10  SavingAccount_Active_ARG_Salary                139146 non-null  int32  
 11  SavingAccount_Active_ARG                       139146 non-null  int32  
 12  SavingAccount_Active_DOLLAR                    139146 non-null  int32  
 13  DebitCard_Active                               139146 non-null  int32  
 14  Investment_Active                              139146 non-null  int32  
 15  Package_Active                                 139146 non-null  int32  
 16  Insurance_Life                                 139146 non-null  int32  
 17  Insurance_Home                                 139146 non-null  int32  
 18  Insurance_Accidents                            139146 non-null  int32  
 19  Insurance_Mobile                               139146 non-null  int32  
 20  Insurance_ATM                                  139146 non-null  int32  
 21  Insurance_Unemployment                         139146 non-null  int32  
 22  Sex                                            139146 non-null  int32  
 23  Client_Age_grp                                 139146 non-null  int64  
 24  SavingAccount_Balance_FirstDate                139146 non-null  float64
 25  SavingAccount_Balance_LastDate                 139146 non-null  float64
 26  SavingAccount_Balance_Average                  139146 non-null  float64
 27  SavingAccount_Days_with_use                    139146 non-null  float64
 28  SavingAccount_Days_with_Credits                139146 non-null  float64
 29  SavingAccount_Days_with_Debits                 139146 non-null  float64
 30  SavingAccount_Salary_Payment_Transactions      139146 non-null  float64
 31  SavingAccount_Transfer_In_Transactions         139146 non-null  float64
 32  SavingAccount_ATM_Extraction_Transactions      139146 non-null  float64
 33  SavingAccount_Service_Payment_Transactions     139146 non-null  float64
 34  SavingAccount_CreditCard_Payment_Transactions  139146 non-null  float64
 35  SavingAccount_Transfer_Out_Transactions        139146 non-null  float64
 36  SavingAccount_DebitCard_Spend_Transactions     139146 non-null  float64
 37  SavingAccount_Transactions_Transactions        139146 non-null  float64
 38  SavingAccount_Credits_Transactions             139146 non-null  float64
 39  SavingAccount_Debits_Transactions              139146 non-null  float64
 40  SavingAccount_Salary_Payment_Amount            139146 non-null  float64
 41  SavingAccount_Transfer_In_Amount               139146 non-null  float64
 42  SavingAccount_ATM_Extraction_Amount            139146 non-null  float64
 43  SavingAccount_Service_Payment_Amount           139146 non-null  float64
 44  SavingAccount_CreditCard_Payment_Amount        139146 non-null  float64
 45  SavingAccount_Transfer_Out_Amount              139146 non-null  float64
 46  SavingAccount_DebitCard_Spend_Amount           139146 non-null  float64
 47  SavingAccount_Total_Amount                     139146 non-null  float64
 48  SavingAccount_Credits_Amounts                  139146 non-null  float64
 49  SavingAccount_Debits_Amounts                   139146 non-null  float64
 50  Operations_Bank                                139146 non-null  float64
 51  Operations_Terminal                            139146 non-null  float64
 52  Operations_HomeBanking                         139146 non-null  float64
 53  Operations_Mobile                              139146 non-null  float64
 54  Operations_Ivr                                 139146 non-null  float64
 55  Operations_Telemarketer                        139146 non-null  float64
 56  Operations_ATM                                 139146 non-null  float64
 57  CreditCard_Balance_ARG                         139146 non-null  float64
 58  CreditCard_Balance_DOLLAR                      139146 non-null  float64
 59  CreditCard_Total_Limit                         139146 non-null  float64
 60  CreditCard_Total_Spending                      139146 non-null  float64
 61  CreditCard_Spending_1_Installment              139146 non-null  float64
 62  CreditCard_Spending_Installments               139146 non-null  float64
 63  CreditCard_Spending_CrossBoarder               139146 non-null  float64
 64  CreditCard_Spending_Aut_Debits                 139146 non-null  float64
 65  CreditCard_Revolving                           139146 non-null  float64
 66  CreditCard_Payment_Aut_Debit                   139146 non-null  float64
 67  CreditCard_Payment_External                    139146 non-null  float64
 68  CreditCard_Payment_Cash                        139146 non-null  float64
 69  CreditCard_Payment_Web                         139146 non-null  float64
 70  CreditCard_Payment_ATM                         139146 non-null  float64
 71  CreditCard_Payment_TAS                         139146 non-null  float64
 72  Investment_Numbers                             139146 non-null  float64
 73  Mobile                                         139146 non-null  int32  
 74  Email                                          139146 non-null  int32  
 75  CreditCard_Product                             139146 non-null  object 
 76  cantidad_meses                                 139146 non-null  int64  
 77  TGT                                            139146 non-null  float64
 78  Region_AMBA Resto                              139146 non-null  int32  
 79  Region_BUENOS AIRES                            139146 non-null  int32  
 80  Region_CABA Centro/Norte                       139146 non-null  int32  
 81  Region_REGION CENTRO                           139146 non-null  int32  
 82  Region_REGION CUYO                             139146 non-null  int32  
 83  Region_REGION NORTE GRANDE ARGENTINO           139146 non-null  int32  
 84  Region_REGION PATAGONICA                       139146 non-null  int32  
dtypes: float64(51), int32(28), int64(2), object(4)
memory usage: 75.4+ MB
In [ ]:
identity_features = training_window[training_window.Month == '2019-01-01'][['client_id',
                                                                             'Target',
                                                                             'Month',
                                                                             'First_product_dt',
                                                                             'Last_product_dt',
                                                                             'CreditCard_Premium',
                                                                             'CreditCard_Active',
                                                                             'Loan_Active',
                                                                             'Mortgage_Active',
                                                                             'DebitCard_Active',
                                                                             'Investment_Active',
                                                                             'Sex',
                                                                             'Client_Age_grp',
                                                                             'Mobile',
                                                                             'Email',
                                                                             'CreditCard_Product',
                                                                             'Region_AMBA Resto',
                                                                             'Region_BUENOS AIRES',
                                                                             'Region_CABA Centro/Norte',
                                                                             'Region_REGION CENTRO',
                                                                             'Region_REGION CUYO',
                                                                             'Region_REGION NORTE GRANDE ARGENTINO',
                                                                             'Region_REGION PATAGONICA',
                                                                             'SavingAccount_Active_ARG_Salary',
                                                                             'SavingAccount_Active_ARG',
                                                                             'SavingAccount_Active_DOLLAR',
                                                                             'SavingAccount_Days_with_Credits',
                               'SavingAccount_Days_with_Debits',
                               'SavingAccount_Salary_Payment_Transactions',
                               'SavingAccount_Transfer_In_Transactions',
                               'SavingAccount_ATM_Extraction_Transactions',
                               'SavingAccount_CreditCard_Payment_Transactions',
                               'SavingAccount_Transfer_Out_Transactions',
                               'SavingAccount_DebitCard_Spend_Transactions',
                               'SavingAccount_Transactions_Transactions',
                               'SavingAccount_Credits_Transactions',
                               'SavingAccount_Debits_Transactions',
                               'SavingAccount_Salary_Payment_Amount',
                               'SavingAccount_Transfer_In_Amount',
                               'SavingAccount_ATM_Extraction_Amount',
                               'SavingAccount_CreditCard_Payment_Amount',
                               'SavingAccount_Transfer_Out_Amount',
                               'SavingAccount_DebitCard_Spend_Amount',
                               'SavingAccount_Total_Amount',
                               'SavingAccount_Credits_Amounts',
                               'SavingAccount_Debits_Amounts',
                               'Operations_HomeBanking',
                               'Operations_Mobile',
                               'CreditCard_Balance_ARG',
                               'CreditCard_Balance_DOLLAR',
                               'CreditCard_Total_Limit',
                               'CreditCard_Total_Spending',
                               'CreditCard_Spending_1_Installment',
                               'CreditCard_Spending_CrossBoarder',
                               'CreditCard_Spending_Aut_Debits',
                               'CreditCard_Revolving',
                                'TGT']].copy()

identity_features
Out[ ]:
client_id Target Month First_product_dt Last_product_dt CreditCard_Premium CreditCard_Active Loan_Active Mortgage_Active DebitCard_Active ... Operations_Mobile CreditCard_Balance_ARG CreditCard_Balance_DOLLAR CreditCard_Total_Limit CreditCard_Total_Spending CreditCard_Spending_1_Installment CreditCard_Spending_CrossBoarder CreditCard_Spending_Aut_Debits CreditCard_Revolving TGT
2 5928737 0.0 2019-01-01 2016-08-31 2018-12-27 0 0 0 0 1 ... 0.0 0.00 0.00 0.0 0.00 0.00 0.00 0.0000 0.00 0.0
8 6018047 1.0 2019-01-01 2014-04-29 2017-05-31 1 1 0 0 1 ... 0.0 16397.20 0.00 80000.0 16068.08 1000.00 0.00 12510.9125 -14.31 1.0
9 5359038 1.0 2019-01-01 2016-01-07 2017-05-24 1 1 1 0 1 ... 0.0 6906.73 33.67 64000.0 6919.64 1840.00 33.57 299.0000 0.00 1.0
11 6890812 0.0 2019-01-01 2017-06-09 2018-08-27 0 0 1 0 1 ... 0.0 0.00 0.00 0.0 0.00 0.00 0.00 0.0000 0.00 0.0
13 115383 0.0 2019-01-01 2004-07-30 2004-07-30 0 0 0 0 0 ... 0.0 0.00 0.00 0.0 0.00 0.00 0.00 0.0000 0.00 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139131 6570413 0.0 2019-01-01 2015-12-04 2018-11-02 0 1 0 0 1 ... 1.0 537.43 0.00 28000.0 0.00 0.00 0.00 0.0000 0.00 0.0
139132 6258895 0.0 2019-01-01 2015-03-12 2018-03-02 0 0 0 0 1 ... 0.0 0.00 0.00 0.0 0.00 0.00 0.00 0.0000 0.00 0.0
139138 6397274 0.0 2019-01-01 2015-06-23 2015-06-23 0 1 0 0 0 ... 0.0 1475.18 0.00 40000.0 1384.27 0.00 0.00 459.0000 -77.32 0.0
139139 6007291 0.0 2019-01-01 2014-04-13 2014-04-13 1 1 0 0 0 ... 0.0 33148.43 0.00 96000.0 32977.86 11385.99 0.00 12510.9125 0.00 0.0
139145 6412619 0.0 2019-01-01 2015-07-08 2018-06-02 0 0 1 0 1 ... 0.0 0.00 0.00 0.0 0.00 0.00 0.00 0.0000 0.00 0.0

23191 rows × 57 columns

Tratamiento de variables con fechas¶

In [ ]:
dateColumns = ['Month','First_product_dt','Last_product_dt']

identity_features.drop(columns=dateColumns, inplace=True)

identity_features
Out[ ]:
client_id Target CreditCard_Premium CreditCard_Active Loan_Active Mortgage_Active DebitCard_Active Investment_Active Sex Client_Age_grp ... Operations_Mobile CreditCard_Balance_ARG CreditCard_Balance_DOLLAR CreditCard_Total_Limit CreditCard_Total_Spending CreditCard_Spending_1_Installment CreditCard_Spending_CrossBoarder CreditCard_Spending_Aut_Debits CreditCard_Revolving TGT
2 5928737 0.0 0 0 0 0 1 0 1 30 ... 0.0 0.00 0.00 0.0 0.00 0.00 0.00 0.0000 0.00 0.0
8 6018047 1.0 1 1 0 0 1 0 1 60 ... 0.0 16397.20 0.00 80000.0 16068.08 1000.00 0.00 12510.9125 -14.31 1.0
9 5359038 1.0 1 1 1 0 1 0 1 40 ... 0.0 6906.73 33.67 64000.0 6919.64 1840.00 33.57 299.0000 0.00 1.0
11 6890812 0.0 0 0 1 0 1 0 1 40 ... 0.0 0.00 0.00 0.0 0.00 0.00 0.00 0.0000 0.00 0.0
13 115383 0.0 0 0 0 0 0 0 1 70 ... 0.0 0.00 0.00 0.0 0.00 0.00 0.00 0.0000 0.00 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
139131 6570413 0.0 0 1 0 0 1 0 0 30 ... 1.0 537.43 0.00 28000.0 0.00 0.00 0.00 0.0000 0.00 0.0
139132 6258895 0.0 0 0 0 0 1 0 0 30 ... 0.0 0.00 0.00 0.0 0.00 0.00 0.00 0.0000 0.00 0.0
139138 6397274 0.0 0 1 0 0 0 0 1 40 ... 0.0 1475.18 0.00 40000.0 1384.27 0.00 0.00 459.0000 -77.32 0.0
139139 6007291 0.0 1 1 0 0 0 0 0 40 ... 0.0 33148.43 0.00 96000.0 32977.86 11385.99 0.00 12510.9125 0.00 0.0
139145 6412619 0.0 0 0 1 0 1 0 0 60 ... 0.0 0.00 0.00 0.0 0.00 0.00 0.00 0.0000 0.00 0.0

23191 rows × 54 columns

Transformar varible CreditCard_Product¶

In [ ]:
identity_features.CreditCard_Product.value_counts()
Out[ ]:
CreditCard_Product
No                8426
J55660104XX012    8262
J55660202XX012    5759
J55660102XX012     402
J55660702XX012     249
J55661002XX012      62
J55660124XX012      30
J55660123XX012       1
Name: count, dtype: int64
In [ ]:
CreditCard_Product_Transform = identity_features[['client_id','CreditCard_Product','TGT']].groupby('CreditCard_Product').agg(['sum',np.count_nonzero])
CreditCard_Product_Transform.columns = ['_'.join(x) for x in np.array(CreditCard_Product_Transform.columns)]

CreditCard_Product_Transform['Porcent_TGT'] = CreditCard_Product_Transform['TGT_sum']/CreditCard_Product_Transform['client_id_count_nonzero']

CreditCard_Product_Transform
Out[ ]:
client_id_sum client_id_count_nonzero TGT_sum TGT_count_nonzero Porcent_TGT
CreditCard_Product
J55660102XX012 1062656436 402 138.0 138 0.343284
J55660104XX012 34506107357 8262 3880.0 3880 0.469620
J55660123XX012 994616 1 0.0 0 0.000000
J55660124XX012 107695565 30 11.0 11 0.366667
J55660202XX012 23762080087 5759 2128.0 2128 0.369509
J55660702XX012 1060543322 249 108.0 108 0.433735
J55661002XX012 204312610 62 40.0 40 0.645161
No 40418495289 8426 518.0 518 0.061476
In [ ]:
di = { 
 "No"                     : 6,
 "J55660102XX012"         : 35,
 "J55660104XX012"         : 48,
 "J55660123XX012"         : 0,
 "J55660124XX012"         : 35,
 "J55660202XX012"         : 35,
 "J55660702XX012"         : 48,
 "J55661002XX012"         : 48,
 }
identity_features.CreditCard_Product = identity_features.CreditCard_Product.map(di)

identity_features.CreditCard_Product.value_counts()
Out[ ]:
CreditCard_Product
48    8573
6     8426
35    6191
0        1
Name: count, dtype: int64

Aggregate Features to ABT¶

In [ ]:
columns=[x for x in training_window if(x in ([                                             
                               'SavingAccount_Days_with_Credits',
                               'SavingAccount_Days_with_Debits',
                               'SavingAccount_Salary_Payment_Transactions',
                               'SavingAccount_Transfer_In_Transactions',
                               'SavingAccount_ATM_Extraction_Transactions',
                               'SavingAccount_CreditCard_Payment_Transactions',
                               'SavingAccount_Transfer_Out_Transactions',
                               'SavingAccount_DebitCard_Spend_Transactions',
                               'SavingAccount_Transactions_Transactions',
                               'SavingAccount_Credits_Transactions',
                               'SavingAccount_Debits_Transactions',
                               'SavingAccount_Salary_Payment_Amount',
                               'SavingAccount_Transfer_In_Amount',
                               'SavingAccount_ATM_Extraction_Amount',
                               'SavingAccount_CreditCard_Payment_Amount',
                               'SavingAccount_Transfer_Out_Amount',
                               'SavingAccount_DebitCard_Spend_Amount',
                               'SavingAccount_Total_Amount',
                               'SavingAccount_Credits_Amounts',
                               'SavingAccount_Debits_Amounts',
                               'Operations_HomeBanking',
                               'Operations_Mobile',
                               'CreditCard_Balance_ARG',
                               'CreditCard_Balance_DOLLAR',
                               'CreditCard_Total_Limit',
                               'CreditCard_Total_Spending',
                               'CreditCard_Spending_1_Installment',
                               'CreditCard_Spending_CrossBoarder',
                               'CreditCard_Spending_Aut_Debits',
                               'CreditCard_Revolving']))]
len(columns)
Out[ ]:
30
In [ ]:
aggregateFeatures = training_window.groupby('client_id')[columns].agg(['sum','max','min', 'mean','nunique', np.count_nonzero,'var']).reset_index()
aggregateFeatures.columns = ['_'.join(x) for x in np.array(aggregateFeatures.columns)]

aggregateFeatures.rename(columns={'client_id_':'client_id'}, inplace=True)
In [ ]:
aggregateFeatures
Out[ ]:
client_id SavingAccount_Days_with_Credits_sum SavingAccount_Days_with_Credits_max SavingAccount_Days_with_Credits_min SavingAccount_Days_with_Credits_mean SavingAccount_Days_with_Credits_nunique SavingAccount_Days_with_Credits_count_nonzero SavingAccount_Days_with_Credits_var SavingAccount_Days_with_Debits_sum SavingAccount_Days_with_Debits_max ... CreditCard_Spending_Aut_Debits_nunique CreditCard_Spending_Aut_Debits_count_nonzero CreditCard_Spending_Aut_Debits_var CreditCard_Revolving_sum CreditCard_Revolving_max CreditCard_Revolving_min CreditCard_Revolving_mean CreditCard_Revolving_nunique CreditCard_Revolving_count_nonzero CreditCard_Revolving_var
0 1030 13.0 3.0 1.0 2.166667 3 6 0.566667 17.000000 4.000000 ... 2 6 337.500000 0.00 0.00 0.00 0.000000 1 0 0.000000e+00
1 1094 0.0 0.0 0.0 0.000000 1 0 0.000000 0.000000 0.000000 ... 1 0 0.000000 7824.34 4121.95 -86.97 1304.056667 6 6 2.834001e+06
2 1553 0.0 0.0 0.0 0.000000 1 0 0.000000 0.000000 0.000000 ... 1 0 0.000000 0.00 0.00 0.00 0.000000 1 0 0.000000e+00
3 1590 0.0 0.0 0.0 0.000000 1 0 0.000000 0.000000 0.000000 ... 3 6 1414.566667 2874.30 2486.88 -45.10 479.050000 5 4 9.991904e+05
4 1948 8.0 2.0 1.0 1.333333 2 6 0.266667 11.000000 2.000000 ... 6 6 187224.700000 20378.64 9969.35 -0.66 3396.440000 5 4 1.874074e+07
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23186 7344944 7.0 5.0 0.0 1.166667 3 2 4.166667 8.000000 6.000000 ... 1 0 0.000000 0.00 0.00 0.00 0.000000 1 0 0.000000e+00
23187 7345011 8.0 5.0 0.0 1.333333 3 2 4.666667 9.000000 5.000000 ... 1 0 0.000000 0.00 0.00 0.00 0.000000 1 0 0.000000e+00
23188 7345026 15.0 5.0 0.0 2.500000 2 3 7.500000 12.000000 7.000000 ... 1 0 0.000000 0.00 0.00 0.00 0.000000 1 0 0.000000e+00
23189 7345029 6.0 3.0 0.0 1.000000 2 2 2.400000 10.000000 6.000000 ... 1 0 0.000000 0.00 0.00 0.00 0.000000 1 0 0.000000e+00
23190 7345492 10.0 5.0 0.0 1.666667 4 3 4.266667 18.723007 9.723007 ... 1 0 0.000000 0.00 0.00 0.00 0.000000 1 0 0.000000e+00

23191 rows × 211 columns

In [ ]:
Operations = training_window[['client_id','Operations_Bank','Operations_Terminal','Operations_HomeBanking',
                              'Operations_Mobile','Operations_Ivr','Operations_Telemarketer','Operations_ATM']]

Operations['TotalOperations'] = Operations['Operations_Bank'] + Operations['Operations_Terminal'] \
                                + Operations['Operations_HomeBanking'] + Operations['Operations_Mobile'] \
                                + Operations['Operations_Ivr'] + Operations['Operations_Telemarketer'] \
                                + Operations['Operations_ATM']



Operations_Aggregate = Operations.groupby('client_id').agg('sum').reset_index()

Operations_Aggregate['Porcent_Operations_HomeBanking'] = np.where(Operations_Aggregate['TotalOperations'] == 0,0,Operations_Aggregate['Operations_HomeBanking']/Operations_Aggregate['TotalOperations'])
Operations_Aggregate['Porcent_Operations_Mobile'] = np.where(Operations_Aggregate['TotalOperations'] == 0,0,Operations_Aggregate['Operations_Mobile']/Operations_Aggregate['TotalOperations'])

Operations_Aggregate
C:\Users\tutem\AppData\Local\Temp\ipykernel_22168\2919754844.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Operations['TotalOperations'] = Operations['Operations_Bank'] + Operations['Operations_Terminal'] \
Out[ ]:
client_id Operations_Bank Operations_Terminal Operations_HomeBanking Operations_Mobile Operations_Ivr Operations_Telemarketer Operations_ATM TotalOperations Porcent_Operations_HomeBanking Porcent_Operations_Mobile
0 1030 0.0 0.0 6.0 1.0 0.0 1.0 0.0 8.0 0.750000 0.125000
1 1094 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.000000 0.000000
2 1553 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 1.000000 0.000000
3 1590 1.0 0.0 0.0 0.0 1.0 0.0 0.0 2.0 0.000000 0.000000
4 1948 0.0 0.0 8.0 1.0 0.0 0.0 0.0 9.0 0.888889 0.111111
... ... ... ... ... ... ... ... ... ... ... ...
23186 7344944 0.0 1.0 3.0 30.0 0.0 0.0 8.0 42.0 0.071429 0.714286
23187 7345011 2.0 1.0 1.0 3.0 0.0 0.0 7.0 14.0 0.071429 0.214286
23188 7345026 2.0 1.0 0.0 0.0 0.0 0.0 2.0 5.0 0.000000 0.000000
23189 7345029 2.0 1.0 0.0 0.0 0.0 0.0 6.0 9.0 0.000000 0.000000
23190 7345492 2.0 0.0 11.0 22.0 0.0 0.0 0.0 35.0 0.314286 0.628571

23191 rows × 11 columns

In [ ]:
Insurances = training_window[['client_id','Insurance_Life','Insurance_Home','Insurance_Accidents',
                              'Insurance_Mobile','Insurance_ATM','Insurance_Unemployment']]

Insurances['TotalInsurances'] = Insurances['Insurance_Life'] + Insurances['Insurance_Home'] \
                                + Insurances['Insurance_Accidents'] + Insurances['Insurance_Mobile'] \
                                + Insurances['Insurance_ATM'] + Insurances['Insurance_Unemployment']



Insurances_Aggregate = Insurances.groupby('client_id').agg('sum').reset_index()

Insurances_Aggregate['Porcent_Total_Insurances'] = Insurances_Aggregate['TotalInsurances'] / 5
#Insurances_Aggregate = Insurances_Aggregate[['client_id','TotalInsurances','Porcent_Total_Insurances']]

Insurances_Aggregate
C:\Users\tutem\AppData\Local\Temp\ipykernel_22168\4288838385.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Insurances['TotalInsurances'] = Insurances['Insurance_Life'] + Insurances['Insurance_Home'] \
Out[ ]:
client_id Insurance_Life Insurance_Home Insurance_Accidents Insurance_Mobile Insurance_ATM Insurance_Unemployment TotalInsurances Porcent_Total_Insurances
0 1030 0 0 0 0 1 0 1 0.2
1 1094 0 0 0 0 0 0 0 0.0
2 1553 0 0 0 0 0 0 0 0.0
3 1590 1 0 1 0 0 0 2 0.4
4 1948 0 1 0 0 1 0 2 0.4
... ... ... ... ... ... ... ... ... ...
23186 7344944 0 0 0 0 0 0 0 0.0
23187 7345011 0 0 0 0 1 0 1 0.2
23188 7345026 0 0 0 0 0 0 0 0.0
23189 7345029 0 0 0 0 0 0 0 0.0
23190 7345492 0 0 0 0 0 0 0 0.0

23191 rows × 9 columns

Create ABT¶

In [ ]:
ABT = identity_features.merge(aggregateFeatures, how='inner', on='client_id')\
        .merge(Operations_Aggregate, how='inner', on='client_id')\
        .merge(Insurances_Aggregate, how='inner', on='client_id')
ABT.drop(columns=['Target'], inplace=True)
ABT
Out[ ]:
client_id CreditCard_Premium CreditCard_Active Loan_Active Mortgage_Active DebitCard_Active Investment_Active Sex Client_Age_grp Mobile ... Porcent_Operations_HomeBanking Porcent_Operations_Mobile Insurance_Life Insurance_Home Insurance_Accidents Insurance_Mobile Insurance_ATM Insurance_Unemployment TotalInsurances Porcent_Total_Insurances
0 5928737 0 0 0 0 1 0 1 30 1 ... 0.000000 0.0 0 0 0 0 0 0 0 0.0
1 6018047 1 1 0 0 1 0 1 60 1 ... 0.727273 0.0 0 0 0 0 0 0 0 0.0
2 5359038 1 1 1 0 1 0 1 40 1 ... 0.915663 0.0 0 0 0 0 0 0 0 0.0
3 6890812 0 0 1 0 1 0 1 40 1 ... 0.972603 0.0 1 0 0 0 0 1 2 0.4
4 115383 0 0 0 0 0 0 1 70 0 ... 0.000000 0.0 0 0 0 0 0 0 0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23186 6570413 0 1 0 0 1 0 0 30 1 ... 0.300000 0.2 0 0 0 0 0 0 0 0.0
23187 6258895 0 0 0 0 1 0 0 30 1 ... 0.000000 0.0 0 1 0 0 0 0 1 0.2
23188 6397274 0 1 0 0 0 0 1 40 0 ... 0.000000 0.0 1 0 1 0 0 0 2 0.4
23189 6007291 1 1 0 0 0 0 0 40 0 ... 0.000000 0.0 0 0 0 0 0 0 0 0.0
23190 6412619 0 0 1 0 1 0 0 60 1 ... 0.000000 0.0 1 1 0 0 1 0 3 0.6

23191 rows × 281 columns

Dimensionality Reduction¶

Minimum = Maximum¶

In [ ]:
ABT.describe()
Out[ ]:
client_id CreditCard_Premium CreditCard_Active Loan_Active Mortgage_Active DebitCard_Active Investment_Active Sex Client_Age_grp Mobile ... Porcent_Operations_HomeBanking Porcent_Operations_Mobile Insurance_Life Insurance_Home Insurance_Accidents Insurance_Mobile Insurance_ATM Insurance_Unemployment TotalInsurances Porcent_Total_Insurances
count 2.319100e+04 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000 ... 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000 23191.000000
mean 4.360437e+06 0.115346 0.647794 0.127981 0.022379 0.669527 0.054763 0.573541 44.405459 0.881075 ... 0.283736 0.078601 0.193437 0.145272 0.156268 0.076409 0.143978 0.077918 0.793282 0.158656
std 2.144133e+06 0.319446 0.477668 0.334075 0.147917 0.470394 0.227521 0.494573 13.453333 0.323708 ... 0.380932 0.213620 0.395001 0.352382 0.363117 0.265657 0.351075 0.268049 1.174822 0.234964
min 1.030000e+03 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 18.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 2.525100e+06 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 30.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 4.991596e+06 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 1.000000 40.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 6.154208e+06 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 1.000000 50.000000 1.000000 ... 0.619048 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.200000
max 7.345492e+06 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 70.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 6.000000 1.200000

8 rows × 281 columns

In [ ]:
x = pd.DataFrame(ABT.describe().T)
x.columns

delete = x[x['min']==x['max']].reset_index()

print('ABT min/max: ', ABT.shape)
ABT.drop(delete['index'], axis=1, inplace=True)
print('ABT: ', ABT.shape)
ABT min/max:  (23191, 281)
ABT:  (23191, 281)

PCA¶

In [ ]:
#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
cols = [x for x in ABT.columns if (x.startswith('CreditCard_Spending')) ]
print(cols)
scaler = StandardScaler()
data_scaled = scaler.fit_transform(ABT[cols])

type(data_scaled)
['CreditCard_Spending_1_Installment', 'CreditCard_Spending_CrossBoarder', 'CreditCard_Spending_Aut_Debits', 'CreditCard_Spending_1_Installment_sum', 'CreditCard_Spending_1_Installment_max', 'CreditCard_Spending_1_Installment_min', 'CreditCard_Spending_1_Installment_mean', 'CreditCard_Spending_1_Installment_nunique', 'CreditCard_Spending_1_Installment_count_nonzero', 'CreditCard_Spending_1_Installment_var', 'CreditCard_Spending_CrossBoarder_sum', 'CreditCard_Spending_CrossBoarder_max', 'CreditCard_Spending_CrossBoarder_min', 'CreditCard_Spending_CrossBoarder_mean', 'CreditCard_Spending_CrossBoarder_nunique', 'CreditCard_Spending_CrossBoarder_count_nonzero', 'CreditCard_Spending_CrossBoarder_var', 'CreditCard_Spending_Aut_Debits_sum', 'CreditCard_Spending_Aut_Debits_max', 'CreditCard_Spending_Aut_Debits_min', 'CreditCard_Spending_Aut_Debits_mean', 'CreditCard_Spending_Aut_Debits_nunique', 'CreditCard_Spending_Aut_Debits_count_nonzero', 'CreditCard_Spending_Aut_Debits_var']
Out[ ]:
numpy.ndarray
In [ ]:
data_scaled = pd.DataFrame(data_scaled, columns=ABT[cols].columns)

data_scaled.head(5)
Out[ ]:
CreditCard_Spending_1_Installment CreditCard_Spending_CrossBoarder CreditCard_Spending_Aut_Debits CreditCard_Spending_1_Installment_sum CreditCard_Spending_1_Installment_max CreditCard_Spending_1_Installment_min CreditCard_Spending_1_Installment_mean CreditCard_Spending_1_Installment_nunique CreditCard_Spending_1_Installment_count_nonzero CreditCard_Spending_1_Installment_var ... CreditCard_Spending_CrossBoarder_nunique CreditCard_Spending_CrossBoarder_count_nonzero CreditCard_Spending_CrossBoarder_var CreditCard_Spending_Aut_Debits_sum CreditCard_Spending_Aut_Debits_max CreditCard_Spending_Aut_Debits_min CreditCard_Spending_Aut_Debits_mean CreditCard_Spending_Aut_Debits_nunique CreditCard_Spending_Aut_Debits_count_nonzero CreditCard_Spending_Aut_Debits_var
0 -0.404539 -0.174669 -0.473121 -0.482228 -0.595466 -0.206982 -0.482228 -0.857834 -0.857969 -0.369477 ... -0.325674 -0.308900 -0.248440 -0.483256 -0.509510 -0.427386 -0.483256 -0.717120 -0.909830 -0.184095
1 -0.121738 -0.174669 4.916441 1.349612 2.198174 -0.206982 1.349612 1.619630 1.156892 1.813086 ... -0.325674 -0.308900 -0.248440 5.305475 4.312784 6.166697 5.305475 -0.181371 1.155444 -0.131284
2 0.115814 1.574087 -0.344315 -0.178073 -0.230906 -0.206982 -0.178073 1.619630 1.156892 -0.333223 ... 1.156702 0.664959 -0.023213 -0.366725 -0.394261 -0.427386 -0.366725 -0.181371 0.811232 -0.176727
3 -0.404539 -0.174669 -0.473121 -0.482228 -0.595466 -0.206982 -0.482228 -0.857834 -0.857969 -0.369477 ... -0.325674 -0.308900 -0.248440 -0.483256 -0.509510 -0.427386 -0.483256 -0.717120 -0.909830 -0.184095
4 -0.404539 -0.174669 -0.473121 -0.482228 -0.595466 -0.206982 -0.482228 -0.857834 -0.857969 -0.369477 ... -0.325674 -0.308900 -0.248440 -0.483256 -0.509510 -0.427386 -0.483256 -0.717120 -0.909830 -0.184095

5 rows × 24 columns

In [ ]:
data_scaled.describe()
Out[ ]:
CreditCard_Spending_1_Installment CreditCard_Spending_CrossBoarder CreditCard_Spending_Aut_Debits CreditCard_Spending_1_Installment_sum CreditCard_Spending_1_Installment_max CreditCard_Spending_1_Installment_min CreditCard_Spending_1_Installment_mean CreditCard_Spending_1_Installment_nunique CreditCard_Spending_1_Installment_count_nonzero CreditCard_Spending_1_Installment_var ... CreditCard_Spending_CrossBoarder_nunique CreditCard_Spending_CrossBoarder_count_nonzero CreditCard_Spending_CrossBoarder_var CreditCard_Spending_Aut_Debits_sum CreditCard_Spending_Aut_Debits_max CreditCard_Spending_Aut_Debits_min CreditCard_Spending_Aut_Debits_mean CreditCard_Spending_Aut_Debits_nunique CreditCard_Spending_Aut_Debits_count_nonzero CreditCard_Spending_Aut_Debits_var
count 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 ... 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04 2.319100e+04
mean 5.361777e-18 -5.943913e-17 5.469013e-17 5.208584e-17 2.083433e-17 2.451098e-17 4.166867e-17 -6.556688e-17 1.715769e-17 -1.945559e-17 ... -1.862835e-16 4.657087e-17 -6.403494e-17 4.274103e-17 1.363423e-17 -8.885231e-18 1.838324e-17 -1.286827e-17 -2.849402e-17 2.757485e-18
std 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 ... 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00 1.000022e+00
min -5.970550e+00 -1.746695e-01 -4.555431e+00 -7.962067e+00 -4.495013e+00 -1.319936e+01 -7.962067e+00 -8.578344e-01 -8.579690e-01 -3.694769e-01 ... -3.256741e-01 -3.089002e-01 -2.484396e-01 -1.050400e+00 -5.095100e-01 -5.763493e+00 -1.050400e+00 -7.171202e-01 -9.098305e-01 -1.840948e-01
25% -4.045387e-01 -1.746695e-01 -4.731206e-01 -4.822276e-01 -5.954658e-01 -2.069816e-01 -4.822276e-01 -8.578344e-01 -8.579690e-01 -3.694769e-01 ... -3.256741e-01 -3.089002e-01 -2.484396e-01 -4.832563e-01 -5.095100e-01 -4.273857e-01 -4.832563e-01 -7.171202e-01 -9.098305e-01 -1.840948e-01
50% -4.045387e-01 -1.746695e-01 -4.731206e-01 -4.631958e-01 -5.520753e-01 -2.069816e-01 -4.631958e-01 -3.623415e-01 -4.549967e-01 -3.693950e-01 ... -3.256741e-01 -3.089002e-01 -2.484396e-01 -4.832563e-01 -5.095100e-01 -4.273857e-01 -4.832563e-01 -7.171202e-01 -9.098305e-01 -1.840948e-01
75% -1.798650e-01 -1.746695e-01 -4.200653e-02 1.850608e-02 1.678342e-01 -2.069816e-01 1.850608e-02 1.124137e+00 1.156892e+00 -2.023504e-01 ... -3.256741e-01 -3.089002e-01 -2.484396e-01 -7.949366e-03 3.423440e-02 -8.840142e-02 -7.949366e-03 8.901281e-01 1.155444e+00 -1.696582e-01
max 5.161473e+00 7.306930e+00 4.916441e+00 6.997612e+00 3.304082e+00 1.278540e+01 6.997612e+00 1.619630e+00 1.559865e+00 1.394463e+01 ... 7.086209e+00 5.534253e+00 7.171873e+00 5.367874e+00 4.312784e+00 6.617469e+00 5.367874e+00 1.961627e+00 1.155444e+00 2.346509e+01

8 rows × 24 columns

In [ ]:
#Importing PCA and TSNE
from sklearn.decomposition import PCA

#Defining the number of principal components to generate 
n = data_scaled.shape[1]

#Finding principal components for the data
pca1 = PCA(n_components=n, random_state=1)
data_pca = pd.DataFrame(pca1.fit_transform(data_scaled))

#The percentage of variance explained by each principal component
exp_var1 = pca1.explained_variance_ratio_
In [ ]:
pca1.explained_variance_ratio_.cumsum()
Out[ ]:
array([0.38312432, 0.56622212, 0.6940415 , 0.74982331, 0.79421735,
       0.83520993, 0.87372543, 0.90582188, 0.93159319, 0.95309944,
       0.96551444, 0.97468403, 0.98155217, 0.98748567, 0.99094997,
       0.99363639, 0.99602674, 0.99733536, 0.99841912, 0.99946476,
       1.        , 1.        , 1.        , 1.        ])
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# visulaize the explained variance by individual components
plt.figure(figsize = (10,10))
plt.plot(range(1,n+1), pca1.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--')
plt.title("Explained Variances by Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
Out[ ]:
Text(0, 0.5, 'Cumulative Explained Variance')
No description has been provided for this image
In [ ]:
# find the least number of components that can explain more than 80% variance
sum = 0
for ix, i in enumerate(exp_var1):
  sum = sum + i
  if(sum>0.80):
    print("Number of PCs that explain at least 80% variance: ", ix+1)
    break
Number of PCs that explain at least 80% variance:  6
In [ ]:
# find the least number of components that can explain more than 70% variance
sum = 0
for ix, i in enumerate(exp_var1):
  sum = sum + i
  if(sum>0.70):
    print("Number of PCs that explain at least 70% variance: ", ix+1)
    break
Number of PCs that explain at least 70% variance:  4
In [ ]:
cols = ['PC1', 'PC2', 'PC3', 'PC4']

pc1 = pd.DataFrame(np.round(pca1.components_.T[:, 0:4],2), index=data_scaled.columns, columns=cols)
In [ ]:
def color_high(val):
    if val <= -0.25: # you can decide any value as per your understanding
        return 'background: pink'
    elif val >= 0.25:
        return 'background: skyblue'   
    
pc1.style.applymap(color_high)
Out[ ]:
  PC1 PC2 PC3 PC4
CreditCard_Spending_1_Installment 0.210000 0.000000 -0.300000 0.180000
CreditCard_Spending_CrossBoarder 0.140000 0.240000 0.130000 0.050000
CreditCard_Spending_Aut_Debits 0.240000 -0.250000 0.200000 0.110000
CreditCard_Spending_1_Installment_sum 0.250000 0.010000 -0.330000 0.180000
CreditCard_Spending_1_Installment_max 0.250000 0.020000 -0.310000 -0.000000
CreditCard_Spending_1_Installment_min 0.160000 -0.000000 -0.260000 0.380000
CreditCard_Spending_1_Installment_mean 0.250000 0.010000 -0.330000 0.180000
CreditCard_Spending_1_Installment_nunique 0.220000 -0.030000 -0.190000 -0.470000
CreditCard_Spending_1_Installment_count_nonzero 0.230000 -0.040000 -0.190000 -0.430000
CreditCard_Spending_1_Installment_var 0.180000 0.040000 -0.230000 -0.010000
CreditCard_Spending_CrossBoarder_sum 0.190000 0.330000 0.180000 0.040000
CreditCard_Spending_CrossBoarder_max 0.200000 0.310000 0.160000 0.020000
CreditCard_Spending_CrossBoarder_min 0.060000 0.130000 0.090000 0.020000
CreditCard_Spending_CrossBoarder_mean 0.190000 0.330000 0.180000 0.040000
CreditCard_Spending_CrossBoarder_nunique 0.190000 0.250000 0.110000 -0.090000
CreditCard_Spending_CrossBoarder_count_nonzero 0.180000 0.240000 0.120000 -0.080000
CreditCard_Spending_CrossBoarder_var 0.180000 0.300000 0.150000 0.050000
CreditCard_Spending_Aut_Debits_sum 0.240000 -0.260000 0.210000 0.120000
CreditCard_Spending_Aut_Debits_max 0.240000 -0.260000 0.210000 0.100000
CreditCard_Spending_Aut_Debits_min 0.220000 -0.240000 0.200000 0.130000
CreditCard_Spending_Aut_Debits_mean 0.240000 -0.260000 0.210000 0.120000
CreditCard_Spending_Aut_Debits_nunique 0.210000 -0.180000 0.090000 -0.300000
CreditCard_Spending_Aut_Debits_count_nonzero 0.200000 -0.150000 0.050000 -0.400000
CreditCard_Spending_Aut_Debits_var 0.090000 -0.120000 0.100000 0.130000

Correlated Features¶

In [ ]:
ABT.corr()
Out[ ]:
client_id CreditCard_Premium CreditCard_Active Loan_Active Mortgage_Active DebitCard_Active Investment_Active Sex Client_Age_grp Mobile ... Porcent_Operations_HomeBanking Porcent_Operations_Mobile Insurance_Life Insurance_Home Insurance_Accidents Insurance_Mobile Insurance_ATM Insurance_Unemployment TotalInsurances Porcent_Total_Insurances
client_id 1.000000 0.044398 -0.148117 0.008420 -0.125800 0.149141 -0.035678 -0.058477 -0.302570 0.038438 ... 0.003432 0.086509 -0.006130 -0.075573 -0.030472 -0.036124 0.013505 0.012061 -0.035528 -0.035528
CreditCard_Premium 0.044398 1.000000 0.254667 -0.007414 -0.015391 -0.099862 -0.035889 -0.026809 0.020644 0.003805 ... 0.033540 0.028793 0.029580 0.021988 0.042374 0.079069 -0.047733 -0.007771 0.031480 0.031480
CreditCard_Active -0.148117 0.254667 1.000000 0.038196 -0.052002 -0.285630 -0.038765 -0.017215 0.172011 0.012443 ... 0.111196 0.078426 0.155410 0.158473 0.174625 0.191357 -0.019796 0.013618 0.194221 0.194221
Loan_Active 0.008420 -0.007414 0.038196 1.000000 -0.037892 0.223872 -0.040017 -0.011554 0.063915 0.065383 ... 0.005896 0.058987 0.270208 0.158914 0.273786 0.067158 0.217906 0.708234 0.465033 0.465033
Mortgage_Active -0.125800 -0.015391 -0.052002 -0.037892 1.000000 0.002179 0.019961 0.034384 0.002331 -0.029069 ... 0.056854 0.005701 0.009304 0.045174 -0.003294 -0.021570 0.045069 -0.035281 0.016201 0.016201
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Insurance_Mobile -0.036124 0.079069 0.191357 0.067158 -0.021570 -0.058112 -0.019291 0.011712 0.132112 0.040485 ... -0.011414 0.007520 0.175155 0.250395 0.230260 1.000000 0.021671 0.060514 0.451573 0.451573
Insurance_ATM 0.013505 -0.047733 -0.019796 0.217906 0.045069 0.243219 0.037870 0.012652 -0.054322 0.058469 ... 0.128352 0.097599 0.233253 0.149513 0.146543 0.021671 1.000000 0.176342 0.512532 0.512532
Insurance_Unemployment 0.012061 -0.007771 0.013618 0.708234 -0.035281 0.168320 -0.038151 -0.000777 -0.023123 0.051635 ... 0.007100 0.051294 0.271025 0.107511 0.298883 0.060514 0.176342 1.000000 0.510293 0.510293
TotalInsurances -0.035528 0.031480 0.194221 0.465033 0.016201 0.124437 -0.032341 0.032548 0.020222 0.085595 ... 0.088078 0.091103 0.710251 0.589711 0.687889 0.451573 0.512532 0.510293 1.000000 1.000000
Porcent_Total_Insurances -0.035528 0.031480 0.194221 0.465033 0.016201 0.124437 -0.032341 0.032548 0.020222 0.085595 ... 0.088078 0.091103 0.710251 0.589711 0.687889 0.451573 0.512532 0.510293 1.000000 1.000000

281 rows × 281 columns

In [ ]:
# if Inplace is False, return a copy, default drop(inplace=False)
corr_Matrix = ABT.drop(['client_id','TGT'], axis=1).corr().abs()
corr_Matrix
Out[ ]:
CreditCard_Premium CreditCard_Active Loan_Active Mortgage_Active DebitCard_Active Investment_Active Sex Client_Age_grp Mobile Email ... Porcent_Operations_HomeBanking Porcent_Operations_Mobile Insurance_Life Insurance_Home Insurance_Accidents Insurance_Mobile Insurance_ATM Insurance_Unemployment TotalInsurances Porcent_Total_Insurances
CreditCard_Premium 1.000000 0.254667 0.007414 0.015391 0.099862 0.035889 0.026809 0.020644 0.003805 0.088081 ... 0.033540 0.028793 0.029580 0.021988 0.042374 0.079069 0.047733 0.007771 0.031480 0.031480
CreditCard_Active 0.254667 1.000000 0.038196 0.052002 0.285630 0.038765 0.017215 0.172011 0.012443 0.089459 ... 0.111196 0.078426 0.155410 0.158473 0.174625 0.191357 0.019796 0.013618 0.194221 0.194221
Loan_Active 0.007414 0.038196 1.000000 0.037892 0.223872 0.040017 0.011554 0.063915 0.065383 0.012032 ... 0.005896 0.058987 0.270208 0.158914 0.273786 0.067158 0.217906 0.708234 0.465033 0.465033
Mortgage_Active 0.015391 0.052002 0.037892 1.000000 0.002179 0.019961 0.034384 0.002331 0.029069 0.007549 ... 0.056854 0.005701 0.009304 0.045174 0.003294 0.021570 0.045069 0.035281 0.016201 0.016201
DebitCard_Active 0.099862 0.285630 0.223872 0.002179 1.000000 0.148153 0.022313 0.141464 0.118533 0.089266 ... 0.178398 0.164068 0.051871 0.011021 0.018590 0.058112 0.243219 0.168320 0.124437 0.124437
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Insurance_Mobile 0.079069 0.191357 0.067158 0.021570 0.058112 0.019291 0.011712 0.132112 0.040485 0.019767 ... 0.011414 0.007520 0.175155 0.250395 0.230260 1.000000 0.021671 0.060514 0.451573 0.451573
Insurance_ATM 0.047733 0.019796 0.217906 0.045069 0.243219 0.037870 0.012652 0.054322 0.058469 0.060119 ... 0.128352 0.097599 0.233253 0.149513 0.146543 0.021671 1.000000 0.176342 0.512532 0.512532
Insurance_Unemployment 0.007771 0.013618 0.708234 0.035281 0.168320 0.038151 0.000777 0.023123 0.051635 0.012599 ... 0.007100 0.051294 0.271025 0.107511 0.298883 0.060514 0.176342 1.000000 0.510293 0.510293
TotalInsurances 0.031480 0.194221 0.465033 0.016201 0.124437 0.032341 0.032548 0.020222 0.085595 0.060886 ... 0.088078 0.091103 0.710251 0.589711 0.687889 0.451573 0.512532 0.510293 1.000000 1.000000
Porcent_Total_Insurances 0.031480 0.194221 0.465033 0.016201 0.124437 0.032341 0.032548 0.020222 0.085595 0.060886 ... 0.088078 0.091103 0.710251 0.589711 0.687889 0.451573 0.512532 0.510293 1.000000 1.000000

279 rows × 279 columns

In [ ]:
sns.heatmap(corr_Matrix, annot=True)
Out[ ]:
<Axes: >
No description has been provided for this image
In [ ]:
upper = corr_Matrix.where(np.triu(np.ones(corr_Matrix.shape),k=1).astype(bool)).fillna(0)

## features to drop, because correlation is up to 0.8
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
#len(to_drop)

to_drop
Out[ ]:
['CreditCard_Product',
 'SavingAccount_Active_ARG',
 'SavingAccount_Days_with_Debits',
 'SavingAccount_Transactions_Transactions',
 'SavingAccount_Credits_Transactions',
 'SavingAccount_Debits_Transactions',
 'SavingAccount_Salary_Payment_Amount',
 'SavingAccount_Transfer_In_Amount',
 'SavingAccount_ATM_Extraction_Amount',
 'SavingAccount_DebitCard_Spend_Amount',
 'SavingAccount_Credits_Amounts',
 'SavingAccount_Debits_Amounts',
 'CreditCard_Spending_1_Installment',
 'CreditCard_Spending_CrossBoarder',
 'SavingAccount_Days_with_Credits_sum',
 'SavingAccount_Days_with_Credits_max',
 'SavingAccount_Days_with_Credits_min',
 'SavingAccount_Days_with_Credits_mean',
 'SavingAccount_Days_with_Credits_nunique',
 'SavingAccount_Days_with_Debits_sum',
 'SavingAccount_Days_with_Debits_max',
 'SavingAccount_Days_with_Debits_min',
 'SavingAccount_Days_with_Debits_mean',
 'SavingAccount_Days_with_Debits_count_nonzero',
 'SavingAccount_Salary_Payment_Transactions_sum',
 'SavingAccount_Salary_Payment_Transactions_max',
 'SavingAccount_Salary_Payment_Transactions_min',
 'SavingAccount_Salary_Payment_Transactions_mean',
 'SavingAccount_Salary_Payment_Transactions_nunique',
 'SavingAccount_Salary_Payment_Transactions_count_nonzero',
 'SavingAccount_Salary_Payment_Transactions_var',
 'SavingAccount_Transfer_In_Transactions_sum',
 'SavingAccount_Transfer_In_Transactions_max',
 'SavingAccount_Transfer_In_Transactions_mean',
 'SavingAccount_Transfer_In_Transactions_nunique',
 'SavingAccount_Transfer_In_Transactions_count_nonzero',
 'SavingAccount_Transfer_In_Transactions_var',
 'SavingAccount_ATM_Extraction_Transactions_sum',
 'SavingAccount_ATM_Extraction_Transactions_max',
 'SavingAccount_ATM_Extraction_Transactions_min',
 'SavingAccount_ATM_Extraction_Transactions_mean',
 'SavingAccount_ATM_Extraction_Transactions_count_nonzero',
 'SavingAccount_ATM_Extraction_Transactions_var',
 'SavingAccount_CreditCard_Payment_Transactions_sum',
 'SavingAccount_CreditCard_Payment_Transactions_max',
 'SavingAccount_CreditCard_Payment_Transactions_min',
 'SavingAccount_CreditCard_Payment_Transactions_mean',
 'SavingAccount_CreditCard_Payment_Transactions_nunique',
 'SavingAccount_CreditCard_Payment_Transactions_count_nonzero',
 'SavingAccount_CreditCard_Payment_Transactions_var',
 'SavingAccount_Transfer_Out_Transactions_sum',
 'SavingAccount_Transfer_Out_Transactions_max',
 'SavingAccount_Transfer_Out_Transactions_min',
 'SavingAccount_Transfer_Out_Transactions_mean',
 'SavingAccount_Transfer_Out_Transactions_nunique',
 'SavingAccount_Transfer_Out_Transactions_count_nonzero',
 'SavingAccount_Transfer_Out_Transactions_var',
 'SavingAccount_DebitCard_Spend_Transactions_sum',
 'SavingAccount_DebitCard_Spend_Transactions_max',
 'SavingAccount_DebitCard_Spend_Transactions_min',
 'SavingAccount_DebitCard_Spend_Transactions_mean',
 'SavingAccount_DebitCard_Spend_Transactions_nunique',
 'SavingAccount_DebitCard_Spend_Transactions_count_nonzero',
 'SavingAccount_DebitCard_Spend_Transactions_var',
 'SavingAccount_Transactions_Transactions_sum',
 'SavingAccount_Transactions_Transactions_max',
 'SavingAccount_Transactions_Transactions_min',
 'SavingAccount_Transactions_Transactions_mean',
 'SavingAccount_Transactions_Transactions_nunique',
 'SavingAccount_Transactions_Transactions_count_nonzero',
 'SavingAccount_Credits_Transactions_sum',
 'SavingAccount_Credits_Transactions_max',
 'SavingAccount_Credits_Transactions_min',
 'SavingAccount_Credits_Transactions_mean',
 'SavingAccount_Credits_Transactions_nunique',
 'SavingAccount_Credits_Transactions_count_nonzero',
 'SavingAccount_Debits_Transactions_sum',
 'SavingAccount_Debits_Transactions_max',
 'SavingAccount_Debits_Transactions_min',
 'SavingAccount_Debits_Transactions_mean',
 'SavingAccount_Debits_Transactions_nunique',
 'SavingAccount_Debits_Transactions_count_nonzero',
 'SavingAccount_Debits_Transactions_var',
 'SavingAccount_Salary_Payment_Amount_sum',
 'SavingAccount_Salary_Payment_Amount_max',
 'SavingAccount_Salary_Payment_Amount_min',
 'SavingAccount_Salary_Payment_Amount_mean',
 'SavingAccount_Salary_Payment_Amount_count_nonzero',
 'SavingAccount_Transfer_In_Amount_sum',
 'SavingAccount_Transfer_In_Amount_max',
 'SavingAccount_Transfer_In_Amount_min',
 'SavingAccount_Transfer_In_Amount_mean',
 'SavingAccount_Transfer_In_Amount_nunique',
 'SavingAccount_Transfer_In_Amount_count_nonzero',
 'SavingAccount_ATM_Extraction_Amount_sum',
 'SavingAccount_ATM_Extraction_Amount_max',
 'SavingAccount_ATM_Extraction_Amount_min',
 'SavingAccount_ATM_Extraction_Amount_mean',
 'SavingAccount_ATM_Extraction_Amount_nunique',
 'SavingAccount_ATM_Extraction_Amount_count_nonzero',
 'SavingAccount_ATM_Extraction_Amount_var',
 'SavingAccount_CreditCard_Payment_Amount_sum',
 'SavingAccount_CreditCard_Payment_Amount_max',
 'SavingAccount_CreditCard_Payment_Amount_mean',
 'SavingAccount_CreditCard_Payment_Amount_nunique',
 'SavingAccount_CreditCard_Payment_Amount_count_nonzero',
 'SavingAccount_CreditCard_Payment_Amount_var',
 'SavingAccount_Transfer_Out_Amount_max',
 'SavingAccount_Transfer_Out_Amount_mean',
 'SavingAccount_Transfer_Out_Amount_count_nonzero',
 'SavingAccount_Transfer_Out_Amount_var',
 'SavingAccount_DebitCard_Spend_Amount_sum',
 'SavingAccount_DebitCard_Spend_Amount_max',
 'SavingAccount_DebitCard_Spend_Amount_min',
 'SavingAccount_DebitCard_Spend_Amount_mean',
 'SavingAccount_DebitCard_Spend_Amount_nunique',
 'SavingAccount_DebitCard_Spend_Amount_count_nonzero',
 'SavingAccount_Total_Amount_sum',
 'SavingAccount_Total_Amount_max',
 'SavingAccount_Total_Amount_min',
 'SavingAccount_Total_Amount_mean',
 'SavingAccount_Total_Amount_nunique',
 'SavingAccount_Total_Amount_count_nonzero',
 'SavingAccount_Credits_Amounts_sum',
 'SavingAccount_Credits_Amounts_max',
 'SavingAccount_Credits_Amounts_min',
 'SavingAccount_Credits_Amounts_mean',
 'SavingAccount_Credits_Amounts_nunique',
 'SavingAccount_Credits_Amounts_count_nonzero',
 'SavingAccount_Credits_Amounts_var',
 'SavingAccount_Debits_Amounts_sum',
 'SavingAccount_Debits_Amounts_max',
 'SavingAccount_Debits_Amounts_min',
 'SavingAccount_Debits_Amounts_mean',
 'SavingAccount_Debits_Amounts_nunique',
 'SavingAccount_Debits_Amounts_count_nonzero',
 'SavingAccount_Debits_Amounts_var',
 'Operations_HomeBanking_sum',
 'Operations_HomeBanking_max',
 'Operations_HomeBanking_min',
 'Operations_HomeBanking_mean',
 'Operations_HomeBanking_nunique',
 'Operations_HomeBanking_count_nonzero',
 'Operations_Mobile_sum',
 'Operations_Mobile_max',
 'Operations_Mobile_min',
 'Operations_Mobile_mean',
 'Operations_Mobile_nunique',
 'Operations_Mobile_count_nonzero',
 'CreditCard_Balance_ARG_sum',
 'CreditCard_Balance_ARG_max',
 'CreditCard_Balance_ARG_min',
 'CreditCard_Balance_ARG_mean',
 'CreditCard_Balance_ARG_nunique',
 'CreditCard_Balance_ARG_count_nonzero',
 'CreditCard_Balance_DOLLAR_max',
 'CreditCard_Balance_DOLLAR_mean',
 'CreditCard_Balance_DOLLAR_count_nonzero',
 'CreditCard_Balance_DOLLAR_var',
 'CreditCard_Total_Limit_sum',
 'CreditCard_Total_Limit_max',
 'CreditCard_Total_Limit_min',
 'CreditCard_Total_Limit_mean',
 'CreditCard_Total_Limit_count_nonzero',
 'CreditCard_Total_Spending_sum',
 'CreditCard_Total_Spending_max',
 'CreditCard_Total_Spending_min',
 'CreditCard_Total_Spending_mean',
 'CreditCard_Total_Spending_nunique',
 'CreditCard_Total_Spending_count_nonzero',
 'CreditCard_Spending_1_Installment_sum',
 'CreditCard_Spending_1_Installment_max',
 'CreditCard_Spending_1_Installment_mean',
 'CreditCard_Spending_1_Installment_count_nonzero',
 'CreditCard_Spending_1_Installment_var',
 'CreditCard_Spending_CrossBoarder_sum',
 'CreditCard_Spending_CrossBoarder_max',
 'CreditCard_Spending_CrossBoarder_mean',
 'CreditCard_Spending_CrossBoarder_nunique',
 'CreditCard_Spending_CrossBoarder_count_nonzero',
 'CreditCard_Spending_CrossBoarder_var',
 'CreditCard_Spending_Aut_Debits_sum',
 'CreditCard_Spending_Aut_Debits_max',
 'CreditCard_Spending_Aut_Debits_min',
 'CreditCard_Spending_Aut_Debits_mean',
 'CreditCard_Revolving_sum',
 'CreditCard_Revolving_max',
 'CreditCard_Revolving_mean',
 'CreditCard_Revolving_count_nonzero',
 'Operations_HomeBanking_y',
 'Operations_Mobile_y',
 'Porcent_Operations_HomeBanking',
 'Porcent_Operations_Mobile',
 'Porcent_Total_Insurances']
In [ ]:
#drop features
ABT.drop(to_drop,axis=1, inplace=True)
ABT.shape
Out[ ]:
(23191, 87)
In [ ]:
ABT.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23191 entries, 0 to 23190
Data columns (total 87 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   client_id                                          23191 non-null  int32  
 1   CreditCard_Premium                                 23191 non-null  int32  
 2   CreditCard_Active                                  23191 non-null  int32  
 3   Loan_Active                                        23191 non-null  int32  
 4   Mortgage_Active                                    23191 non-null  int32  
 5   DebitCard_Active                                   23191 non-null  int32  
 6   Investment_Active                                  23191 non-null  int32  
 7   Sex                                                23191 non-null  int32  
 8   Client_Age_grp                                     23191 non-null  int64  
 9   Mobile                                             23191 non-null  int32  
 10  Email                                              23191 non-null  int32  
 11  Region_AMBA Resto                                  23191 non-null  int32  
 12  Region_BUENOS AIRES                                23191 non-null  int32  
 13  Region_CABA Centro/Norte                           23191 non-null  int32  
 14  Region_REGION CENTRO                               23191 non-null  int32  
 15  Region_REGION CUYO                                 23191 non-null  int32  
 16  Region_REGION NORTE GRANDE ARGENTINO               23191 non-null  int32  
 17  Region_REGION PATAGONICA                           23191 non-null  int32  
 18  SavingAccount_Active_ARG_Salary                    23191 non-null  int32  
 19  SavingAccount_Active_DOLLAR                        23191 non-null  int32  
 20  SavingAccount_Days_with_Credits                    23191 non-null  float64
 21  SavingAccount_Salary_Payment_Transactions          23191 non-null  float64
 22  SavingAccount_Transfer_In_Transactions             23191 non-null  float64
 23  SavingAccount_ATM_Extraction_Transactions          23191 non-null  float64
 24  SavingAccount_CreditCard_Payment_Transactions      23191 non-null  float64
 25  SavingAccount_Transfer_Out_Transactions            23191 non-null  float64
 26  SavingAccount_DebitCard_Spend_Transactions         23191 non-null  float64
 27  SavingAccount_CreditCard_Payment_Amount            23191 non-null  float64
 28  SavingAccount_Transfer_Out_Amount                  23191 non-null  float64
 29  SavingAccount_Total_Amount                         23191 non-null  float64
 30  Operations_HomeBanking_x                           23191 non-null  float64
 31  Operations_Mobile_x                                23191 non-null  float64
 32  CreditCard_Balance_ARG                             23191 non-null  float64
 33  CreditCard_Balance_DOLLAR                          23191 non-null  float64
 34  CreditCard_Total_Limit                             23191 non-null  float64
 35  CreditCard_Total_Spending                          23191 non-null  float64
 36  CreditCard_Spending_Aut_Debits                     23191 non-null  float64
 37  CreditCard_Revolving                               23191 non-null  float64
 38  TGT                                                23191 non-null  float64
 39  SavingAccount_Days_with_Credits_count_nonzero      23191 non-null  int64  
 40  SavingAccount_Days_with_Credits_var                23191 non-null  float64
 41  SavingAccount_Days_with_Debits_nunique             23191 non-null  int64  
 42  SavingAccount_Days_with_Debits_var                 23191 non-null  float64
 43  SavingAccount_Transfer_In_Transactions_min         23191 non-null  float64
 44  SavingAccount_ATM_Extraction_Transactions_nunique  23191 non-null  int64  
 45  SavingAccount_Transactions_Transactions_var        23191 non-null  float64
 46  SavingAccount_Credits_Transactions_var             23191 non-null  float64
 47  SavingAccount_Salary_Payment_Amount_nunique        23191 non-null  int64  
 48  SavingAccount_Salary_Payment_Amount_var            23191 non-null  float64
 49  SavingAccount_Transfer_In_Amount_var               23191 non-null  float64
 50  SavingAccount_CreditCard_Payment_Amount_min        23191 non-null  float64
 51  SavingAccount_Transfer_Out_Amount_sum              23191 non-null  float64
 52  SavingAccount_Transfer_Out_Amount_min              23191 non-null  float64
 53  SavingAccount_Transfer_Out_Amount_nunique          23191 non-null  int64  
 54  SavingAccount_DebitCard_Spend_Amount_var           23191 non-null  float64
 55  SavingAccount_Total_Amount_var                     23191 non-null  float64
 56  Operations_HomeBanking_var                         23191 non-null  float64
 57  Operations_Mobile_var                              23191 non-null  float64
 58  CreditCard_Balance_ARG_var                         23191 non-null  float64
 59  CreditCard_Balance_DOLLAR_sum                      23191 non-null  float64
 60  CreditCard_Balance_DOLLAR_min                      23191 non-null  float64
 61  CreditCard_Balance_DOLLAR_nunique                  23191 non-null  int64  
 62  CreditCard_Total_Limit_nunique                     23191 non-null  int64  
 63  CreditCard_Total_Limit_var                         23191 non-null  float64
 64  CreditCard_Total_Spending_var                      23191 non-null  float64
 65  CreditCard_Spending_1_Installment_min              23191 non-null  float64
 66  CreditCard_Spending_1_Installment_nunique          23191 non-null  int64  
 67  CreditCard_Spending_CrossBoarder_min               23191 non-null  float64
 68  CreditCard_Spending_Aut_Debits_nunique             23191 non-null  int64  
 69  CreditCard_Spending_Aut_Debits_count_nonzero       23191 non-null  int64  
 70  CreditCard_Spending_Aut_Debits_var                 23191 non-null  float64
 71  CreditCard_Revolving_min                           23191 non-null  float64
 72  CreditCard_Revolving_nunique                       23191 non-null  int64  
 73  CreditCard_Revolving_var                           23191 non-null  float64
 74  Operations_Bank                                    23191 non-null  float64
 75  Operations_Terminal                                23191 non-null  float64
 76  Operations_Ivr                                     23191 non-null  float64
 77  Operations_Telemarketer                            23191 non-null  float64
 78  Operations_ATM                                     23191 non-null  float64
 79  TotalOperations                                    23191 non-null  float64
 80  Insurance_Life                                     23191 non-null  int32  
 81  Insurance_Home                                     23191 non-null  int32  
 82  Insurance_Accidents                                23191 non-null  int32  
 83  Insurance_Mobile                                   23191 non-null  int32  
 84  Insurance_ATM                                      23191 non-null  int32  
 85  Insurance_Unemployment                             23191 non-null  int32  
 86  TotalInsurances                                    23191 non-null  int32  
dtypes: float64(49), int32(26), int64(12)
memory usage: 13.1 MB
In [ ]:
ABT
Out[ ]:
client_id CreditCard_Premium CreditCard_Active Loan_Active Mortgage_Active DebitCard_Active Investment_Active Sex Client_Age_grp Mobile ... Operations_Telemarketer Operations_ATM TotalOperations Insurance_Life Insurance_Home Insurance_Accidents Insurance_Mobile Insurance_ATM Insurance_Unemployment TotalInsurances
0 5928737 0 0 0 0 1 0 1 30 1 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
1 6018047 1 1 0 0 1 0 1 60 1 ... 0.0 0.0 11.0 0 0 0 0 0 0 0
2 5359038 1 1 1 0 1 0 1 40 1 ... 0.0 7.0 83.0 0 0 0 0 0 0 0
3 6890812 0 0 1 0 1 0 1 40 1 ... 1.0 0.0 73.0 1 0 0 0 0 1 2
4 115383 0 0 0 0 0 0 1 70 0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23186 6570413 0 1 0 0 1 0 0 30 1 ... 0.0 0.0 10.0 0 0 0 0 0 0 0
23187 6258895 0 0 0 0 1 0 0 30 1 ... 0.0 0.0 0.0 0 1 0 0 0 0 1
23188 6397274 0 1 0 0 0 0 1 40 0 ... 0.0 0.0 0.0 1 0 1 0 0 0 2
23189 6007291 1 1 0 0 0 0 0 40 0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
23190 6412619 0 0 1 0 1 0 0 60 1 ... 0.0 0.0 6.0 1 1 0 0 1 0 3

23191 rows × 87 columns

Feature selection¶

In [ ]:
names = [x for x in ABT.columns if (x != 'client_id') & (x != 'TGT') ]

scaler = StandardScaler(copy=True)
scaler.fit(ABT[names]) # Entrena
scaled_est = scaler.transform(ABT[names]) # Standariza el total de la base
scaled_est = pd.DataFrame(scaled_est, columns=names, index=ABT.index)

ABT.drop(names, axis=1, inplace = True)
ABT = pd.concat((ABT, scaled_est), axis=1, sort=False)

ABT.head(5)
Out[ ]:
client_id TGT CreditCard_Premium CreditCard_Active Loan_Active Mortgage_Active DebitCard_Active Investment_Active Sex Client_Age_grp ... Operations_Telemarketer Operations_ATM TotalOperations Insurance_Life Insurance_Home Insurance_Accidents Insurance_Mobile Insurance_ATM Insurance_Unemployment TotalInsurances
0 5928737 0.0 -0.361090 -1.35619 -0.383097 -0.1513 0.702561 -0.240698 0.862295 -1.070796 ... -0.431961 -0.251284 -0.639048 -0.489723 -0.412265 -0.43036 -0.287629 -0.410115 -0.290693 -0.675250
1 6018047 1.0 2.769392 0.73736 -0.383097 -0.1513 0.702561 -0.240698 0.862295 1.159183 ... -0.431961 -0.251284 -0.196695 -0.489723 -0.412265 -0.43036 -0.287629 -0.410115 -0.290693 -0.675250
2 5359038 1.0 2.769392 0.73736 2.610303 -0.1513 0.702561 -0.240698 0.862295 -0.327469 ... -0.431961 1.489019 2.698708 -0.489723 -0.412265 -0.43036 -0.287629 -0.410115 -0.290693 -0.675250
3 6890812 0.0 -0.361090 -1.35619 2.610303 -0.1513 0.702561 -0.240698 0.862295 -0.327469 ... 0.312898 -0.251284 2.296569 2.041969 -0.412265 -0.43036 -0.287629 -0.410115 3.440055 1.027172
4 115383 0.0 -0.361090 -1.35619 -0.383097 -0.1513 -1.423364 -0.240698 0.862295 1.902509 ... -0.431961 -0.251284 -0.639048 -0.489723 -0.412265 -0.43036 -0.287629 -0.410115 -0.290693 -0.675250

5 rows × 87 columns

searchCV¶

In [ ]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


target_column = 'TGT'
numerical_cols = [x for x in ABT.columns if (x != 'client_id') & (x != 'TGT')]

estimator = XGBClassifier(
    objective= 'binary:logistic',
    seed=42
)

parameters = {
    'max_depth': np.arange(6, 10, 1),
    'learning_rate': np.arange(0.01, 1, 0.05),
    'gamma': np.arange(0.1, 2, 0.1),
    'alpha': np.arange(0,10,1),
    'lambda': np.arange(0,10,1),
    'subsample': np.arange(0.1, 1, 0.1),
    'n_estimators':  np.arange(15, 20, 1)
}

#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100

grid_search = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=parameters,
    n_iter= n_HP_points_to_test,
    scoring='roc_auc',
    cv=3,
    refit=True,
    verbose=False)

grid_search.fit(ABT[numerical_cols], ABT[target_column])
Out[ ]:
RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, device=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate...
                                        'gamma': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
                                        'lambda': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                        'learning_rate': array([0.01, 0.06, 0.11, 0.16, 0.21, 0.26, 0.31, 0.36, 0.41, 0.46, 0.51,
       0.56, 0.61, 0.66, 0.71, 0.76, 0.81, 0.86, 0.91, 0.96]),
                                        'max_depth': array([6, 7, 8, 9]),
                                        'n_estimators': array([15, 16, 17, 18, 19]),
                                        'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
                   scoring='roc_auc', verbose=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, device=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate...
                                        'gamma': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
                                        'lambda': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                        'learning_rate': array([0.01, 0.06, 0.11, 0.16, 0.21, 0.26, 0.31, 0.36, 0.41, 0.46, 0.51,
       0.56, 0.61, 0.66, 0.71, 0.76, 0.81, 0.86, 0.91, 0.96]),
                                        'max_depth': array([6, 7, 8, 9]),
                                        'n_estimators': array([15, 16, 17, 18, 19]),
                                        'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
                   scoring='roc_auc', verbose=False)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
In [ ]:
from xgboost import plot_importance
plot_importance(grid_search.best_estimator_,max_num_features=25)
plt.show()
No description has been provided for this image

importance feature¶

In [ ]:
feature_importance = grid_search.best_estimator_.get_booster().get_score()
keys = list(feature_importance.keys())
values = list(feature_importance.values())
data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by="score", ascending=False)
a=data.head(50)
In [ ]:
a.to_csv(r'result/top_50_Features.csv', sep='|')
In [ ]:
a = pd.read_csv(r'result/top_50_Features.csv', sep='|')

a.columns = ['column', 'order']
a
Out[ ]:
column order
0 CreditCard_Balance_ARG_var 62.0
1 TotalOperations 61.0
2 CreditCard_Total_Limit 58.0
3 CreditCard_Total_Spending 50.0
4 SavingAccount_Total_Amount 48.0
5 Operations_Bank 44.0
6 Client_Age_grp 44.0
7 Operations_Terminal 42.0
8 CreditCard_Total_Spending_var 40.0
9 Operations_HomeBanking_var 40.0
10 CreditCard_Revolving_min 38.0
11 CreditCard_Total_Limit_var 38.0
12 CreditCard_Revolving 37.0
13 CreditCard_Spending_Aut_Debits_var 36.0
14 SavingAccount_Transactions_Transactions_var 35.0
15 SavingAccount_Total_Amount_var 34.0
16 CreditCard_Balance_ARG 33.0
17 CreditCard_Revolving_var 31.0
18 CreditCard_Spending_Aut_Debits 30.0
19 SavingAccount_Days_with_Credits_var 29.0
20 SavingAccount_Credits_Transactions_var 27.0
21 CreditCard_Balance_DOLLAR_sum 24.0
22 TotalInsurances 23.0
23 Operations_Telemarketer 21.0
24 Operations_Mobile_var 21.0
25 CreditCard_Active 20.0
26 SavingAccount_Transfer_In_Amount_var 19.0
27 SavingAccount_DebitCard_Spend_Transactions 19.0
28 SavingAccount_DebitCard_Spend_Amount_var 17.0
29 SavingAccount_Salary_Payment_Amount_var 17.0
30 CreditCard_Spending_1_Installment_min 17.0
31 SavingAccount_Days_with_Debits_var 17.0
32 Operations_ATM 16.0
33 SavingAccount_Active_ARG_Salary 15.0
34 SavingAccount_Transfer_Out_Amount_sum 15.0
35 SavingAccount_Salary_Payment_Transactions 15.0
36 CreditCard_Spending_Aut_Debits_nunique 15.0
37 Operations_HomeBanking_x 15.0
38 DebitCard_Active 14.0
39 Loan_Active 14.0
40 SavingAccount_Days_with_Debits_nunique 13.0
41 SavingAccount_CreditCard_Payment_Amount 12.0
42 Operations_Ivr 11.0
43 CreditCard_Spending_1_Installment_nunique 10.0
44 Region_REGION CUYO 10.0
45 CreditCard_Spending_Aut_Debits_count_nonzero 10.0
46 Mobile 10.0
47 SavingAccount_Days_with_Credits_count_nonzero 10.0
48 Region_BUENOS AIRES 9.0
49 Region_REGION CENTRO 9.0

Bivariate Analysis¶

In [ ]:
def plot_numerical_data(data_col, target_col = 'TGT', _df = ABT, bins=None, qs = None):
    print (data_col)
    df = _df.copy()
    if (bins == None):
        df['rank'] = round(df[data_col].rank(pct=True) * 9)
    else:
        df['rank'] = pd.cut(df[data_col], bins)
    if (qs != None):
        df['rank'] = pd.qcut(df[data_col], qs, duplicates='drop')
    rank = pd.DataFrame(
        df.groupby(['rank'])[data_col].agg([np.min, np.max, np.average])
    ).reset_index()

    groupped = pd.DataFrame(
        df.groupby(['rank'])[['client_id']].agg('nunique', np.sum).astype('int64')
    ).reset_index()
    groupped.columns = ['rank', '# of clients']

    target = pd.DataFrame(
        df.groupby(['rank'])[[target_col]].agg(np.sum).astype('int64')
    ).reset_index() 
    target.columns = ['rank', '# of target clients']
    
    merged = rank.merge(groupped, how='left').merge(target, how='left')
    merged['target_p'] = (
        (merged['# of target clients'] / merged['# of clients']) * 100
    )
    display(merged)
    # Plot
    merged['# of clients'].plot(kind='bar')
    merged['target_p'].plot(secondary_y=True, color='g')
    plt.show()
    return merged
In [ ]:
a
Out[ ]:
column order
0 CreditCard_Balance_ARG_var 62.0
1 TotalOperations 61.0
2 CreditCard_Total_Limit 58.0
3 CreditCard_Total_Spending 50.0
4 SavingAccount_Total_Amount 48.0
5 Operations_Bank 44.0
6 Client_Age_grp 44.0
7 Operations_Terminal 42.0
8 CreditCard_Total_Spending_var 40.0
9 Operations_HomeBanking_var 40.0
10 CreditCard_Revolving_min 38.0
11 CreditCard_Total_Limit_var 38.0
12 CreditCard_Revolving 37.0
13 CreditCard_Spending_Aut_Debits_var 36.0
14 SavingAccount_Transactions_Transactions_var 35.0
15 SavingAccount_Total_Amount_var 34.0
16 CreditCard_Balance_ARG 33.0
17 CreditCard_Revolving_var 31.0
18 CreditCard_Spending_Aut_Debits 30.0
19 SavingAccount_Days_with_Credits_var 29.0
20 SavingAccount_Credits_Transactions_var 27.0
21 CreditCard_Balance_DOLLAR_sum 24.0
22 TotalInsurances 23.0
23 Operations_Telemarketer 21.0
24 Operations_Mobile_var 21.0
25 CreditCard_Active 20.0
26 SavingAccount_Transfer_In_Amount_var 19.0
27 SavingAccount_DebitCard_Spend_Transactions 19.0
28 SavingAccount_DebitCard_Spend_Amount_var 17.0
29 SavingAccount_Salary_Payment_Amount_var 17.0
30 CreditCard_Spending_1_Installment_min 17.0
31 SavingAccount_Days_with_Debits_var 17.0
32 Operations_ATM 16.0
33 SavingAccount_Active_ARG_Salary 15.0
34 SavingAccount_Transfer_Out_Amount_sum 15.0
35 SavingAccount_Salary_Payment_Transactions 15.0
36 CreditCard_Spending_Aut_Debits_nunique 15.0
37 Operations_HomeBanking_x 15.0
38 DebitCard_Active 14.0
39 Loan_Active 14.0
40 SavingAccount_Days_with_Debits_nunique 13.0
41 SavingAccount_CreditCard_Payment_Amount 12.0
42 Operations_Ivr 11.0
43 CreditCard_Spending_1_Installment_nunique 10.0
44 Region_REGION CUYO 10.0
45 CreditCard_Spending_Aut_Debits_count_nonzero 10.0
46 Mobile 10.0
47 SavingAccount_Days_with_Credits_count_nonzero 10.0
48 Region_BUENOS AIRES 9.0
49 Region_REGION CENTRO 9.0
In [ ]:
most_important=a['column']
for columna in most_important:
    plot_numerical_data(columna)
CreditCard_Balance_ARG_var
rank min max average # of clients # of target clients target_p
0 1.0 -0.333423 -0.333423 -0.333423 7363 622 8.447644
1 3.0 -0.333423 -0.329075 -0.332007 1655 450 27.190332
2 4.0 -0.329074 -0.311315 -0.321289 2577 1069 41.482344
3 5.0 -0.311312 -0.272232 -0.294380 2577 1090 42.297245
4 6.0 -0.272174 -0.179269 -0.232096 2577 1074 41.676368
5 7.0 -0.179199 0.072656 -0.077905 2576 1085 42.119565
6 8.0 0.072684 1.204171 0.459300 2577 1049 40.706248
7 9.0 1.205259 24.458540 3.263167 1289 384 29.790535
No description has been provided for this image
TotalOperations
rank min max average # of clients # of target clients target_p
0 1.0 -0.639048 -0.639048 -0.639048 5666 575 10.148253
1 2.0 -0.598834 -0.598834 -0.598834 1545 257 16.634304
2 3.0 -0.558620 -0.518406 -0.540265 1985 381 19.193955
3 4.0 -0.478192 -0.397764 -0.433633 2767 694 25.081316
4 5.0 -0.357550 -0.277122 -0.321452 2179 689 31.620009
5 6.0 -0.236909 -0.035839 -0.149538 2479 941 37.958854
6 7.0 0.004375 0.567370 0.241806 2766 1182 42.733189
7 8.0 0.607584 2.135713 1.225947 2513 1313 52.248309
8 9.0 2.175927 10.138283 3.206693 1291 791 61.270333
No description has been provided for this image
CreditCard_Total_Limit
rank min max average # of clients # of target clients target_p
0 1.0 -0.878796 -0.878796 -0.878796 7427 554 7.459270
1 3.0 -0.869794 -0.446705 -0.554437 1748 546 31.235698
2 4.0 -0.374690 -0.302674 -0.328121 1599 589 36.835522
3 5.0 -0.158644 -0.014614 -0.081112 3851 1528 39.678006
4 6.0 0.021394 0.201432 0.129444 1311 589 44.927536
5 7.0 0.273447 0.561508 0.402769 3442 1475 42.852992
6 8.0 0.849568 1.641735 1.109193 2373 990 41.719343
7 9.0 2.001811 4.162266 2.878371 1440 552 38.333333
No description has been provided for this image
CreditCard_Total_Spending
rank min max average # of clients # of target clients target_p
0 0.0 -2.069368 -0.650194 -0.842358 18 3 16.666667
1 2.0 -0.649929 -0.649929 -0.649929 9377 837 8.926096
2 4.0 -0.649781 -0.439504 -0.546846 2201 741 33.666515
3 5.0 -0.439478 -0.174241 -0.308974 2576 1067 41.420807
4 6.0 -0.174133 0.155760 -0.017973 2577 1156 44.858362
5 7.0 0.156280 0.680816 0.387786 2576 1197 46.467391
6 8.0 0.680919 2.010680 1.194128 2577 1189 46.138921
7 9.0 2.011431 4.315257 3.164614 1289 633 49.107836
No description has been provided for this image
SavingAccount_Total_Amount
rank min max average # of clients # of target clients target_p
0 2.0 -0.535720 -0.535720 -0.535720 11770 2409 20.467290
1 5.0 -0.535719 -0.353948 -0.458817 2402 532 22.148210
2 6.0 -0.353872 -0.113505 -0.246878 2577 712 27.629026
3 7.0 -0.113304 0.487357 0.127233 2576 1060 41.149068
4 8.0 0.487399 3.069775 1.447382 2577 1385 53.744664
5 9.0 3.071361 3.092456 3.092357 1289 725 56.245151
No description has been provided for this image
Operations_Bank
rank min max average # of clients # of target clients target_p
0 3.0 -0.339982 -0.339982 -0.339982 17488 4236 24.222324
1 7.0 0.105446 0.105446 0.105446 2475 1032 41.696970
2 8.0 0.550874 1.441731 0.841044 2014 904 44.885799
3 9.0 1.887159 42.421137 3.287287 1214 651 53.624382
No description has been provided for this image
Client_Age_grp
rank min max average # of clients # of target clients target_p
0 0.0 -1.962787 -1.962787 -1.962787 989 249 25.176946
1 1.0 -1.070796 -1.070796 -1.070796 5308 1214 22.871138
2 4.0 -0.327469 -0.327469 -0.327469 6563 1660 25.293311
3 6.0 0.415857 0.415857 0.415857 5149 1796 34.880559
4 7.0 1.159183 1.159183 1.159183 2122 776 36.569274
5 8.0 1.530846 1.530846 1.530846 1745 694 39.770774
6 9.0 1.902509 1.902509 1.902509 1315 434 33.003802
No description has been provided for this image
Operations_Terminal
rank min max average # of clients # of target clients target_p
0 3.0 -0.485396 -0.485396 -0.485396 14773 3178 21.512218
1 6.0 -0.245225 -0.245225 -0.245225 1741 605 34.750144
2 7.0 -0.005054 0.715460 0.344768 2992 1123 37.533422
3 8.0 0.955631 1.676145 1.169663 2490 1161 46.626506
4 9.0 1.916316 24.732580 3.057481 1195 756 63.263598
No description has been provided for this image
CreditCard_Total_Spending_var
rank min max average # of clients # of target clients target_p
0 2.0 -0.364309 -0.364309 -0.364309 8555 738 8.626534
1 3.0 -0.364309 -0.363583 -0.364076 463 150 32.397408
2 4.0 -0.363579 -0.340502 -0.354216 2577 992 38.494373
3 5.0 -0.340495 -0.289687 -0.318348 2577 1111 43.112146
4 6.0 -0.289648 -0.173521 -0.239913 2577 1133 43.965852
5 7.0 -0.173507 0.116739 -0.055659 2576 1082 42.003106
6 8.0 0.116846 1.379620 0.563281 2577 1096 42.530074
7 9.0 1.380478 16.479469 3.358022 1289 521 40.418929
No description has been provided for this image
Operations_HomeBanking_var
rank min max average # of clients # of target clients target_p
0 2.0 -0.379385 -0.379385 -0.379385 12506 2933 23.452743
1 5.0 -0.326504 -0.326504 -0.326504 1995 564 28.270677
2 6.0 -0.294776 -0.167862 -0.256302 2021 580 28.698664
3 7.0 -0.167862 0.181151 -0.046336 2766 968 34.996385
4 8.0 0.181151 1.524321 0.662021 2610 1173 44.942529
5 9.0 1.524321 16.806844 3.336611 1293 605 46.790410
No description has been provided for this image
CreditCard_Revolving_min
rank min max average # of clients # of target clients target_p
0 0.0 -6.168433 -0.712710 -2.280568 1288 571 44.332298
1 1.0 -0.712662 -0.086175 -0.278155 2577 1104 42.840512
2 2.0 -0.086037 -0.026004 -0.042562 2577 998 38.727202
3 3.0 -0.026000 -0.020023 -0.021423 2574 823 31.973582
4 4.0 -0.020021 -0.019922 -0.019969 571 143 25.043783
5 6.0 -0.019921 -0.019921 -0.019921 11110 2301 20.711071
6 8.0 -0.019919 1.108721 0.387373 1205 427 35.435685
7 9.0 1.112892 6.128592 2.781178 1289 456 35.376261
No description has been provided for this image
CreditCard_Total_Limit_var
rank min max average # of clients # of target clients target_p
0 3.0 -0.216976 -0.216976 -0.216976 16387 5216 31.830109
1 6.0 -0.216952 -0.195942 -0.206782 335 78 23.283582
2 7.0 -0.195438 -0.009670 -0.111716 2603 683 26.238955
3 8.0 -0.008997 0.644553 0.214651 2574 623 24.203574
4 9.0 0.644553 29.465396 2.603049 1292 223 17.260062
No description has been provided for this image
CreditCard_Revolving
rank min max average # of clients # of target clients target_p
0 0.0 -5.612467 -0.336828 -0.883084 1288 528 40.993789
1 1.0 -0.336769 -0.295570 -0.302413 2575 974 37.825243
2 2.0 -0.295569 -0.295435 -0.295494 900 302 33.555556
3 4.0 -0.295434 -0.295434 -0.295434 13356 3180 23.809524
4 7.0 -0.295432 -0.122488 -0.253519 1206 461 38.225539
5 8.0 -0.122321 2.042820 0.750355 2577 947 36.748157
6 9.0 2.043695 5.021600 3.491048 1289 431 33.436773
No description has been provided for this image
CreditCard_Spending_Aut_Debits_var
rank min max average # of clients # of target clients target_p
0 3.0 -0.184095 -0.184095 -0.184095 13308 2428 18.244665
1 5.0 -0.184095 -0.183724 -0.183931 868 371 42.741935
2 6.0 -0.183719 -0.175336 -0.180889 2574 1120 43.512044
3 7.0 -0.175329 -0.115869 -0.155630 2575 1158 44.970874
4 8.0 -0.115839 0.437216 0.060002 2577 1202 46.643384
5 9.0 0.439633 23.465088 2.576660 1289 544 42.203258
No description has been provided for this image
SavingAccount_Transactions_Transactions_var
rank min max average # of clients # of target clients target_p
0 2.0 -0.324941 -0.324941 -0.324941 10935 2277 20.823045
1 4.0 -0.319016 -0.319016 -0.319016 646 119 18.421053
2 5.0 -0.315461 -0.285836 -0.302442 2609 702 26.906861
3 6.0 -0.282281 -0.211182 -0.251483 2572 805 31.298600
4 7.0 -0.211182 0.025817 -0.118617 2564 999 38.962559
5 8.0 0.025817 1.344714 0.488846 2576 1274 49.456522
6 9.0 1.348269 18.682349 3.289422 1289 647 50.193949
No description has been provided for this image
SavingAccount_Total_Amount_var
rank min max average # of clients # of target clients target_p
0 2.0 -0.335904 -0.335904 -0.335904 10684 2404 22.500936
1 4.0 -0.335904 -0.335896 -0.335903 911 91 9.989023
2 5.0 -0.335896 -0.327482 -0.333761 2577 578 22.429181
3 6.0 -0.327476 -0.274374 -0.306968 2577 868 33.682577
4 7.0 -0.274294 -0.001548 -0.171869 2576 1084 42.080745
5 8.0 -0.001374 1.575777 0.513347 2577 1308 50.756694
6 9.0 1.585892 7.529617 3.619710 1289 490 38.013964
No description has been provided for this image
CreditCard_Balance_ARG
rank min max average # of clients # of target clients target_p
0 0.0 -5.548031 -0.653676 -0.741103 766 144 18.798956
1 2.0 -0.653675 -0.653675 -0.653675 7961 677 8.503957
2 3.0 -0.653674 -0.626485 -0.645485 291 63 21.649485
3 4.0 -0.626203 -0.418643 -0.519399 2577 955 37.058595
4 5.0 -0.418577 -0.188284 -0.307576 2577 1081 41.948002
5 6.0 -0.188089 0.130912 -0.044256 2577 1148 44.547924
6 7.0 0.130944 0.686263 0.380433 2576 1120 43.478261
7 8.0 0.686605 2.065600 1.231719 2577 1133 43.965852
8 9.0 2.066048 4.240681 3.142321 1289 502 38.944919
No description has been provided for this image
CreditCard_Revolving_var
rank min max average # of clients # of target clients target_p
0 2.0 -0.283414 -0.283414 -0.283414 9790 1830 18.692543
1 4.0 -0.283414 -0.283410 -0.283413 1805 558 30.914127
2 5.0 -0.283410 -0.281129 -0.282967 2577 1031 40.007761
3 6.0 -0.281127 -0.245192 -0.268607 2577 1028 39.891347
4 7.0 -0.245144 -0.059331 -0.177578 2576 995 38.625776
5 8.0 -0.059146 1.110684 0.320401 2577 997 38.688397
6 9.0 1.120624 16.779817 3.366449 1289 384 29.790535
No description has been provided for this image
CreditCard_Spending_Aut_Debits
rank min max average # of clients # of target clients target_p
0 0.0 -4.555431 -0.508876 -0.892626 18 4 22.222222
1 3.0 -0.473121 -0.473121 -0.473121 13065 2183 16.708764
2 5.0 -0.473078 -0.370593 -0.410499 1090 466 42.752294
3 6.0 -0.370158 -0.146712 -0.283454 2576 1115 43.284161
4 7.0 -0.146501 0.457383 0.121738 2576 1188 46.118012
5 8.0 0.457943 1.952927 1.027224 2577 1210 46.953822
6 9.0 1.955667 4.916441 3.424559 1289 657 50.969744
No description has been provided for this image
SavingAccount_Days_with_Credits_var
rank min max average # of clients # of target clients target_p
0 2.0 -0.514025 -0.514025 -0.514025 11877 2531 21.310095
1 5.0 -0.338605 -0.233353 -0.324791 2141 656 30.639888
2 6.0 -0.233353 0.082402 -0.115928 3101 1032 33.279587
3 7.0 0.082402 0.503409 0.260621 2331 850 36.465036
4 8.0 0.503409 1.766431 0.953721 2422 1120 46.242775
5 9.0 1.766431 13.554633 3.216469 1319 634 48.066717
No description has been provided for this image
SavingAccount_Credits_Transactions_var
rank min max average # of clients # of target clients target_p
0 2.0 -0.466557 -0.466557 -0.466557 11555 2392 20.700995
1 4.0 -0.386273 -0.386273 -0.386273 3 0 0.000000
2 5.0 -0.386273 -0.322046 -0.359128 2663 760 28.539241
3 6.0 -0.322046 -0.129366 -0.176838 2530 826 32.648221
4 7.0 -0.129366 0.304165 0.102248 2573 976 37.932375
5 8.0 0.304165 1.733210 0.861197 2578 1224 47.478666
6 9.0 1.733210 14.177146 3.345793 1289 645 50.038790
No description has been provided for this image
CreditCard_Balance_DOLLAR_sum
rank min max average # of clients # of target clients target_p
0 0.0 -13.787115 -0.220613 -1.725336 269 116 43.122677
1 4.0 -0.220613 -0.220613 -0.220613 19933 5492 27.552300
2 8.0 -0.220443 1.497041 0.372545 1700 700 41.176471
3 9.0 1.500672 13.345889 3.280270 1289 515 39.953452
No description has been provided for this image
TotalInsurances
rank min max average # of clients # of target clients target_p
0 3.0 -0.675250 -0.675250 -0.675250 13686 3061 22.365921
1 6.0 0.175961 0.175961 0.175961 4333 1536 35.448881
2 8.0 1.027172 1.878383 1.347880 4244 1721 40.551367
3 9.0 2.729594 4.432016 2.972666 928 505 54.418103
No description has been provided for this image
Operations_Telemarketer
rank min max average # of clients # of target clients target_p
0 3.0 -0.431961 -0.431961 -0.431961 16570 4252 25.660833
1 7.0 0.312898 0.312898 0.312898 3634 1291 35.525592
2 8.0 1.057756 1.057756 1.057756 1456 600 41.208791
3 9.0 1.802614 33.086659 2.926469 1531 680 44.415415
No description has been provided for this image
Operations_Mobile_var
rank min max average # of clients # of target clients target_p
0 4.0 -0.241753 -0.241753 -0.241753 19430 5197 26.747298
1 8.0 -0.193304 1.085745 0.180265 2472 964 38.996764
2 9.0 1.085745 14.951795 3.298401 1289 662 51.357642
No description has been provided for this image
CreditCard_Active
rank min max average # of clients # of target clients target_p
0 2.0 -1.35619 -1.35619 -1.35619 8168 513 6.280607
1 6.0 0.73736 0.73736 0.73736 15023 6310 42.002263
No description has been provided for this image
SavingAccount_Transfer_In_Amount_var
rank min max average # of clients # of target clients target_p
0 3.0 -0.375733 -0.375733 -0.375733 17025 4753 27.917768
1 7.0 -0.375733 -0.041883 -0.275449 2300 616 26.782609
2 8.0 -0.041748 2.611488 0.987892 2577 866 33.604967
3 9.0 2.612386 5.351821 3.479117 1289 588 45.616757
No description has been provided for this image
SavingAccount_DebitCard_Spend_Transactions
rank min max average # of clients # of target clients target_p
0 4.0 -0.293537 -0.293537 -0.293537 20041 5096 25.427873
1 8.0 -0.041914 1.719450 0.562755 1888 928 49.152542
2 9.0 1.971074 5.242178 3.819569 1262 799 63.312203
No description has been provided for this image
SavingAccount_DebitCard_Spend_Amount_var
rank min max average # of clients # of target clients target_p
0 4.0 -0.313453 -0.313453 -0.313453 19147 4878 25.476576
1 7.0 -0.313453 -0.309404 -0.311838 180 55 30.555556
2 8.0 -0.309402 2.293676 0.490071 2575 1256 48.776699
3 9.0 2.295398 7.033519 3.720617 1289 634 49.185415
No description has been provided for this image
SavingAccount_Salary_Payment_Amount_var
rank min max average # of clients # of target clients target_p
0 4.0 -0.154153 -0.154153 -0.154153 21791 6053 27.777523
1 8.0 -0.154153 -0.122156 -0.143151 111 57 51.351351
2 9.0 -0.122080 12.279220 2.618346 1289 713 55.314197
No description has been provided for this image
CreditCard_Spending_1_Installment_min
rank min max average # of clients # of target clients target_p
0 0.0 -13.199359 -0.207642 -2.069425 146 55 37.671233
1 4.0 -0.206982 -0.206982 -0.206982 18271 4499 24.623721
2 7.0 -0.206321 -0.095421 -0.119688 1099 494 44.949955
3 8.0 -0.094761 0.802558 0.155231 2386 1140 47.778709
4 9.0 0.803006 12.785396 2.982973 1289 635 49.262995
No description has been provided for this image
SavingAccount_Days_with_Debits_var
rank min max average # of clients # of target clients target_p
0 2.0 -0.408663 -0.408663 -0.408663 12092 2688 22.229573
1 5.0 -0.364147 -0.272408 -0.326267 1788 526 29.418345
2 6.0 -0.272408 -0.119121 -0.249566 2545 831 32.652259
3 7.0 -0.119121 0.204484 -0.017488 2881 1047 36.341548
4 8.0 0.204484 1.570371 0.710316 2597 1180 45.437043
5 9.0 1.584065 12.524903 3.389561 1288 551 42.779503
No description has been provided for this image
Operations_ATM
rank min max average # of clients # of target clients target_p
0 4.0 -0.251284 -0.251284 -0.251284 19651 5029 25.591573
1 8.0 -0.002669 0.991790 0.261233 2228 1005 45.107720
2 9.0 1.240405 26.847729 3.320090 1312 789 60.137195
No description has been provided for this image
SavingAccount_Active_ARG_Salary
rank min max average # of clients # of target clients target_p
0 4.0 -0.335318 -0.335318 -0.335318 20847 5453 26.157241
1 9.0 2.982243 2.982243 2.982243 2344 1370 58.447099
No description has been provided for this image
SavingAccount_Transfer_Out_Amount_sum
rank min max average # of clients # of target clients target_p
0 4.0 -0.230216 -0.230216 -0.230216 20709 5532 26.713023
1 8.0 -0.230131 1.197264 0.251976 1193 573 48.030176
2 9.0 1.197851 10.713349 3.465422 1289 718 55.702095
No description has been provided for this image
SavingAccount_Salary_Payment_Transactions
rank min max average # of clients # of target clients target_p
0 4.0 -0.269507 -0.269507 -0.269507 21283 5583 26.232204
1 8.0 1.607974 1.607974 1.607974 930 585 62.903226
2 9.0 3.485454 5.362935 4.335888 978 655 66.973415
No description has been provided for this image
CreditCard_Spending_Aut_Debits_nunique
rank min max average # of clients # of target clients target_p
0 3.0 -0.717120 -0.717120 -0.717120 13308 2428 18.244665
1 6.0 -0.181371 0.354379 0.046653 4013 1613 40.194368
2 7.0 0.890128 0.890128 0.890128 1361 612 44.966936
3 8.0 1.425878 1.961627 1.806332 4509 2170 48.125970
No description has been provided for this image
Operations_HomeBanking_x
rank min max average # of clients # of target clients target_p
0 3.0 -0.428762 -0.428762 -0.428762 16415 4053 24.690832
1 7.0 -0.061738 0.305287 0.070739 3416 1126 32.962529
2 8.0 0.672311 1.773385 1.082601 2019 875 43.338286
3 9.0 2.140409 5.076606 3.438270 1341 769 57.345265
No description has been provided for this image
DebitCard_Active
rank min max average # of clients # of target clients target_p
0 1.0 -1.423364 -1.423364 -1.423364 7664 1800 23.48643
1 6.0 0.702561 0.702561 0.702561 15527 5023 32.35010
No description has been provided for this image
Loan_Active
rank min max average # of clients # of target clients target_p
0 4.0 -0.383097 -0.383097 -0.383097 20223 5470 27.048410
1 8.0 2.610303 2.610303 2.610303 2968 1353 45.586253
No description has been provided for this image
SavingAccount_Days_with_Debits_nunique
rank min max average # of clients # of target clients target_p
0 2.0 -0.788464 -0.788464 -0.788464 12092 2688 22.229573
1 6.0 0.040048 0.040048 0.040048 4309 1346 31.236946
2 7.0 0.868561 0.868561 0.868561 3690 1386 37.560976
3 8.0 1.697073 1.697073 1.697073 2145 983 45.827506
4 9.0 2.525585 3.354098 2.634897 955 420 43.979058
No description has been provided for this image
SavingAccount_CreditCard_Payment_Amount
rank min max average # of clients # of target clients target_p
0 3.0 -0.369675 -0.369675 -0.369675 17551 3639 20.733861
1 7.0 -0.369673 0.187913 -0.122015 1774 911 51.352875
2 8.0 0.188039 1.866350 0.827353 2577 1489 57.780365
3 9.0 1.869587 5.489941 3.547350 1289 784 60.822343
No description has been provided for this image
Operations_Ivr
rank min max average # of clients # of target clients target_p
0 4.0 -0.198273 -0.198273 -0.198273 18778 5125 27.292576
1 8.0 0.206850 0.206850 0.206850 2747 1017 37.022206
2 9.0 0.611973 39.908903 1.893728 1666 681 40.876351
No description has been provided for this image
CreditCard_Spending_1_Installment_nunique
rank min max average # of clients # of target clients target_p
0 2.0 -0.857834 -0.857834 -0.857834 11364 1686 14.836325
1 5.0 -0.362342 -0.362342 -0.362342 2059 803 38.999514
2 6.0 0.133151 0.628644 0.374648 3568 1452 40.695067
3 7.0 1.124137 1.124137 1.124137 1784 802 44.955157
4 8.0 1.619630 1.619630 1.619630 4416 2080 47.101449
No description has been provided for this image
Region_REGION CUYO
rank min max average # of clients # of target clients target_p
0 4.0 -0.275860 -0.275860 -0.275860 21551 6334 29.390748
1 9.0 3.625032 3.625032 3.625032 1640 489 29.817073
No description has been provided for this image
CreditCard_Spending_Aut_Debits_count_nonzero
rank min max average # of clients # of target clients target_p
0 2.0 -0.909830 -0.909830 -0.909830 12293 2000 16.269422
1 5.0 -0.565618 0.811232 0.200570 1474 485 32.903664
2 7.0 1.155444 1.155444 1.155444 9424 4338 46.031409
No description has been provided for this image
Mobile
rank min max average # of clients # of target clients target_p
0 1.0 -2.721880 -2.721880 -2.721880 2758 647 23.459028
1 5.0 0.367393 0.367393 0.367393 20433 6176 30.225615
No description has been provided for this image
SavingAccount_Days_with_Credits_count_nonzero
rank min max average # of clients # of target clients target_p
0 2.0 -1.026614 -1.026614 -1.026614 10409 2188 21.020271
1 4.0 -0.673649 -0.320684 -0.535876 1158 243 20.984456
2 5.0 0.032281 0.738212 0.480955 2257 625 27.691626
3 7.0 1.091177 1.091177 1.091177 9367 3767 40.215651
No description has been provided for this image
Region_BUENOS AIRES
rank min max average # of clients # of target clients target_p
0 3.0 -0.656852 -0.656852 -0.656852 16201 4685 28.917968
1 8.0 1.522412 1.522412 1.522412 6990 2138 30.586552
No description has been provided for this image
Region_REGION CENTRO
rank min max average # of clients # of target clients target_p
0 4.0 -0.496008 -0.496008 -0.496008 18612 5707 30.663013
1 8.0 2.016096 2.016096 2.016096 4579 1116 24.372134
No description has been provided for this image

Select the main features¶

In [ ]:
selected_features = list(a['column'][:50])
selected_features += ['client_id', 'TGT']
In [ ]:
selected_features
Out[ ]:
['CreditCard_Balance_ARG_var',
 'TotalOperations',
 'CreditCard_Total_Limit',
 'CreditCard_Total_Spending',
 'SavingAccount_Total_Amount',
 'Operations_Bank',
 'Client_Age_grp',
 'Operations_Terminal',
 'CreditCard_Total_Spending_var',
 'Operations_HomeBanking_var',
 'CreditCard_Revolving_min',
 'CreditCard_Total_Limit_var',
 'CreditCard_Revolving',
 'CreditCard_Spending_Aut_Debits_var',
 'SavingAccount_Transactions_Transactions_var',
 'SavingAccount_Total_Amount_var',
 'CreditCard_Balance_ARG',
 'CreditCard_Revolving_var',
 'CreditCard_Spending_Aut_Debits',
 'SavingAccount_Days_with_Credits_var',
 'SavingAccount_Credits_Transactions_var',
 'CreditCard_Balance_DOLLAR_sum',
 'TotalInsurances',
 'Operations_Telemarketer',
 'Operations_Mobile_var',
 'CreditCard_Active',
 'SavingAccount_Transfer_In_Amount_var',
 'SavingAccount_DebitCard_Spend_Transactions',
 'SavingAccount_DebitCard_Spend_Amount_var',
 'SavingAccount_Salary_Payment_Amount_var',
 'CreditCard_Spending_1_Installment_min',
 'SavingAccount_Days_with_Debits_var',
 'Operations_ATM',
 'SavingAccount_Active_ARG_Salary',
 'SavingAccount_Transfer_Out_Amount_sum',
 'SavingAccount_Salary_Payment_Transactions',
 'CreditCard_Spending_Aut_Debits_nunique',
 'Operations_HomeBanking_x',
 'DebitCard_Active',
 'Loan_Active',
 'SavingAccount_Days_with_Debits_nunique',
 'SavingAccount_CreditCard_Payment_Amount',
 'Operations_Ivr',
 'CreditCard_Spending_1_Installment_nunique',
 'Region_REGION CUYO',
 'CreditCard_Spending_Aut_Debits_count_nonzero',
 'Mobile',
 'SavingAccount_Days_with_Credits_count_nonzero',
 'Region_BUENOS AIRES',
 'Region_REGION CENTRO',
 'client_id',
 'TGT']
In [ ]:
ABT_Model_Select = ABT[selected_features].copy()

Split in Train and Test¶

In [ ]:
ABT_Model_Select.shape
Out[ ]:
(23191, 52)
In [ ]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(ABT_Model_Select, test_size=0.3, random_state=42, stratify=ABT_Model_Select['TGT']);  
In [ ]:
X_train.TGT.value_counts()
Out[ ]:
TGT
0.0    11457
1.0     4776
Name: count, dtype: int64
In [ ]:
X_test.TGT.value_counts()
Out[ ]:
TGT
0.0    4911
1.0    2047
Name: count, dtype: int64
In [ ]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

target_column = 'TGT'
numerical_cols = [x for x in ABT_Model_Select.columns if (x != 'client_id') & (x != 'TGT')]

estimator = XGBClassifier(
    objective= 'binary:logistic',
    seed=42
)

parameters = {
    'max_depth': np.arange(6, 10, 1),
    'learning_rate': np.arange(0.01, 1, 0.05),
    'gamma': np.arange(0.1, 2, 0.1),
    'alpha': np.arange(0,10,1),
    'lambda': np.arange(0,10,1),
    'subsample': np.arange(0.1, 1, 0.1),
    'n_estimators': np.arange(15, 20, 1)
}

cross_val = StratifiedKFold(n_splits=3) 

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 4,
    cv = cross_val,
    verbose=True
)

#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100

grid_search = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=parameters,
    n_iter= n_HP_points_to_test,
    scoring='roc_auc',
    cv= cross_val,
    refit= True,
    verbose= True)

grid_search.fit(X_train[numerical_cols], X_train[target_column])
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Out[ ]:
RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, device=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, grow_policy=None,
                                           impor...
                                        'gamma': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
                                        'lambda': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                        'learning_rate': array([0.01, 0.06, 0.11, 0.16, 0.21, 0.26, 0.31, 0.36, 0.41, 0.46, 0.51,
       0.56, 0.61, 0.66, 0.71, 0.76, 0.81, 0.86, 0.91, 0.96]),
                                        'max_depth': array([6, 7, 8, 9]),
                                        'n_estimators': array([15, 16, 17, 18, 19]),
                                        'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
                   scoring='roc_auc', verbose=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, device=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, grow_policy=None,
                                           impor...
                                        'gamma': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
                                        'lambda': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                        'learning_rate': array([0.01, 0.06, 0.11, 0.16, 0.21, 0.26, 0.31, 0.36, 0.41, 0.46, 0.51,
       0.56, 0.61, 0.66, 0.71, 0.76, 0.81, 0.86, 0.91, 0.96]),
                                        'max_depth': array([6, 7, 8, 9]),
                                        'n_estimators': array([15, 16, 17, 18, 19]),
                                        'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
                   scoring='roc_auc', verbose=True)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
In [ ]:
best_parameters = grid_search.best_estimator_
In [ ]:
best_parameters.get_xgb_params()
Out[ ]:
{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'eval_metric': None,
 'gamma': 1.7000000000000002,
 'grow_policy': None,
 'interaction_constraints': None,
 'learning_rate': 0.26,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 7,
 'max_leaves': None,
 'min_child_weight': None,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 0.8,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'seed': 42,
 'lambda': 0,
 'alpha': 9}

Modelling¶

In [ ]:
#create model
model = XGBClassifier(objective='binary:logistic',
                      booster='gbtree',
                      seed=42,
                      gamma=1.7000000000000002,
                      learning_rate=0.26,
                      max_depth=7,
                      subsample=0.8,
                      n_estimators=19,
                     alpha=9,
                                         )

Train¶

In [ ]:
# fit model
model.fit(X_train[numerical_cols], X_train[target_column])
Out[ ]:
XGBClassifier(alpha=9, base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=1.7000000000000002, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.26, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=19, n_jobs=None,
              num_parallel_tree=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(alpha=9, base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=1.7000000000000002, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.26, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=19, n_jobs=None,
              num_parallel_tree=None, ...)
In [ ]:
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(X_train[target_column], model.predict(X_train[numerical_cols])) * 100
print("Accuracy of Model: ",accuracy)
print(classification_report(X_train[target_column], model.predict(X_train[numerical_cols])))
pd.crosstab(X_train[target_column], model.predict(X_train[numerical_cols]))
Accuracy of Model:  81.12486909382123
              precision    recall  f1-score   support

         0.0       0.84      0.90      0.87     11457
         1.0       0.72      0.59      0.65      4776

    accuracy                           0.81     16233
   macro avg       0.78      0.75      0.76     16233
weighted avg       0.80      0.81      0.81     16233

Out[ ]:
col_0 0 1
TGT
0.0 10368 1089
1.0 1975 2801

Test¶

In [ ]:
y_pred = model.predict(X_test[numerical_cols])
In [ ]:
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(X_test[target_column], y_pred) * 100
print("Accuracy of Model: ",accuracy)
print(classification_report(X_test[target_column], y_pred))
pd.crosstab(X_test[target_column], y_pred)
Accuracy of Model:  79.04570278815751
              precision    recall  f1-score   support

         0.0       0.82      0.90      0.86      4911
         1.0       0.68      0.54      0.60      2047

    accuracy                           0.79      6958
   macro avg       0.75      0.72      0.73      6958
weighted avg       0.78      0.79      0.78      6958

Out[ ]:
col_0 0 1
TGT
0.0 4396 515
1.0 943 1104

Deciles¶

In [ ]:
# results..
from sklearn.metrics import mean_squared_error

probabilities_train = model.predict_proba(X_train[numerical_cols])
probabilities       = model.predict_proba(X_test[numerical_cols])

y_pred = model.predict(X_test[numerical_cols])
In [ ]:
a = pd.DataFrame(X_train[['client_id', target_column]], columns=['TGT', 'idx'])
a = a.reset_index()
b = pd.DataFrame(probabilities_train[:,1], columns=['Prob1'])

result = pd.concat([a, b], axis=1)

result['porc'] = result['Prob1'].rank(pct=True) * 100

len(probabilities[:,1])

result.loc[result['porc'].between(0, 10, inclusive='neither'), 'decil'] = '10'
result.loc[result['porc'].between(10, 20, inclusive='both'), 'decil'] = '9'
result.loc[result['porc'].between(20, 30, inclusive='neither'), 'decil'] = '8'
result.loc[result['porc'].between(30, 40, inclusive='both'), 'decil'] = '7'
result.loc[result['porc'].between(40, 50, inclusive='neither'), 'decil'] = '6'
result.loc[result['porc'].between(50, 60, inclusive='both'), 'decil'] = '5'
result.loc[result['porc'].between(60, 70, inclusive='neither'), 'decil'] = '4'
result.loc[result['porc'].between(70, 80, inclusive='both'), 'decil'] = '3'
result.loc[result['porc'].between(80, 90, inclusive='neither'), 'decil'] = '2'
result.loc[result['porc'].between(90, 101, inclusive='both'), 'decil'] = '1'

print(result.decil.value_counts().sort_index())
print(result[result.TGT == 1].decil.value_counts().sort_index())

a = result.groupby('decil')['Prob1'].agg('min')
print(a.sort_index())
decil
1     1624
10    1691
2     1623
3     1623
4     1624
5     1623
6     1623
7     1624
8     1623
9     1555
Name: count, dtype: int64
decil
1     1354
10       5
2     1048
3      815
4      594
5      423
6      308
7      159
8       53
9       17
Name: count, dtype: int64
decil
1     0.688578
10    0.012251
2     0.539833
3     0.426468
4     0.332262
5     0.251477
6     0.164156
7     0.075242
8     0.031342
9     0.014199
Name: Prob1, dtype: float32
In [ ]:
##############################################
# test 

a = pd.DataFrame(X_test[[target_column, 'client_id']], columns=['TGT', 'idx'])
a = a.reset_index()
b = pd.DataFrame(probabilities[:,1], columns=['Prob1'])

result = pd.concat([a, b], axis=1)

result['porc'] = result['Prob1'].rank(pct=True) * 100

result['decil'] = np.where(result.Prob1 >= 0.688578               , 1, 
                            np.where((result.Prob1 >=  0.539833            ) & (result.Prob1 <  0.688578       ), 2,
                            np.where((result.Prob1 >=  0.426468               ) & (result.Prob1 < 0.539833                   ) , 3,
                            np.where((result.Prob1 >=  0.332262                ) & (result.Prob1 < 0.426468                 ), 4,
                            np.where((result.Prob1 >=  0.251477                 ) & (result.Prob1 < 0.332262               ), 5,
                            np.where((result.Prob1 >=  0.164156                ) & (result.Prob1 < 0.251477              ), 6,
                            np.where((result.Prob1 >=  0.075242                 ) & (result.Prob1 < 0.164156               ) , 7,
                            np.where((result.Prob1 >=  0.031342               ) & (result.Prob1 < 0.075242              ), 8,
                            np.where((result.Prob1 >=  0.014199                 ) & (result.Prob1 < 0.031342                    ), 9,
                            10)))))))))
print("Total")
print(result.decil.value_counts().sort_index())
print("Buenos")
print(result[result.TGT == 1].decil.value_counts())
Total
decil
1     694
2     702
3     697
4     676
5     664
6     703
7     697
8     711
9     672
10    742
Name: count, dtype: int64
Buenos
decil
1     531
2     452
3     340
4     278
5     173
6     149
7      89
8      28
9       4
10      3
Name: count, dtype: int64

Performance¶

In [ ]:
y_test = X_test['TGT']
y_pred = model.predict(X_test[numerical_cols])
In [ ]:
from sklearn.metrics import confusion_matrix


cm = confusion_matrix(y_test, y_pred)

print(cm)
[[4374  537]
 [ 943 1104]]
In [ ]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()
No description has been provided for this image
In [ ]:
threshold = 0.45
y_pred = (model.predict_proba(X_test[numerical_cols])[:, 1] > threshold).astype('float')
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()
No description has been provided for this image
In [ ]:
threshold = 0.42
y_pred = (model.predict_proba(X_test[numerical_cols])[:, 1] > threshold).astype('float')
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()
No description has been provided for this image
In [ ]:
# ROC

from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt
import numpy as np

yPred = y_pred
yScore = result['Prob1']
yTest = result['TGT']
areaBajoCurvaRoc = roc_auc_score(yTest, yScore)
accuracy = accuracy_score(yTest, yPred)
fpr, tpr, _ = roc_curve(yTest, yScore)
plt.plot(fpr, tpr)
plt.plot([0,1])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.annotate('Area bajo la curva ROC : {}'.format(areaBajoCurvaRoc), (-0.02,0.99))
plt.annotate('Accuracy : {}'.format(accuracy), (-0.02,0.94))
plt.show()
display()
No description has been provided for this image
In [ ]:
pVar = 'Prob1'
result['porc'] = result[pVar].rank(pct=True) * 100

len(probabilities[:,1])

result.loc[result['porc'].between(0, 10, inclusive='neither'), 'decil'] = '10'
result.loc[result['porc'].between(10, 20, inclusive='both'), 'decil'] = '9'
result.loc[result['porc'].between(20, 30, inclusive='neither'), 'decil'] = '8'
result.loc[result['porc'].between(30, 40, inclusive='both'), 'decil'] = '7'
result.loc[result['porc'].between(40, 50, inclusive='neither'), 'decil'] = '6'
result.loc[result['porc'].between(50, 60, inclusive='both'), 'decil'] = '5'
result.loc[result['porc'].between(60, 70, inclusive='neither'), 'decil'] = '4'
result.loc[result['porc'].between(70, 80, inclusive='both'), 'decil'] = '3'
result.loc[result['porc'].between(80, 90, inclusive='neither'), 'decil'] = '2'
result.loc[result['porc'].between(90, 101, inclusive='both'), 'decil'] = '1'

a = pd.DataFrame(result.decil.value_counts().reset_index())
a.columns = ['index','decil']

b = pd.DataFrame(result[result.TGT == 1].decil.value_counts().reset_index())

b.columns = ['index','decil']
In [ ]:
b
Out[ ]:
index decil
0 1 531
1 2 450
2 3 342
3 4 283
4 5 178
5 6 143
6 7 86
7 8 27
8 9 4
9 10 3
In [ ]:
c = a.merge(b, how='left', on='index')
c['TGT_%'] = (c['decil_y'] /c['decil_x'])*100
c
Out[ ]:
index decil_x decil_y TGT_%
0 10 742 3 0.404313
1 8 697 27 3.873745
2 4 696 283 40.660920
3 1 696 531 76.293103
4 2 696 450 64.655172
5 3 696 342 49.137931
6 5 696 178 25.574713
7 6 695 143 20.575540
8 7 695 86 12.374101
9 9 649 4 0.616333